diff --git a/.flake8 b/.flake8
new file mode 100644
index 000000000000..e694a9d33d04
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+ignore = E,F403,F405,F541,F841,W
+select = E9,F,W6
+per-file-ignores =
+ __init__.py:F401
diff --git a/.github/ISSUE_TEMPLATE/ci_failure_report.md b/.github/ISSUE_TEMPLATE/ci_failure_report.md
new file mode 100644
index 000000000000..6bf4c7762319
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/ci_failure_report.md
@@ -0,0 +1,10 @@
+---
+name: CI failure report
+about: Report a DeepSpeed CI failure
+title: "{{ env.GITHUB_WORKFLOW }} CI test failure"
+labels: ci-failure
+assignees: ''
+
+---
+
+The Nightly CI for {{ env.GITHUB_SERVER_URL }}/{{ env.GITHUB_REPOSITORY }}/actions/runs/{{ env.GITHUB_RUN_ID }} failed.
diff --git a/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md
new file mode 100644
index 000000000000..bf997775fe32
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md
@@ -0,0 +1,44 @@
+---
+name: Bug report (DeepSpeed-Chat)
+about: Create a DeepSpeed-Chat related issue to help us improve
+title: "[BUG]"
+labels: bug,deepspeed-chat
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is. Please include which training step you are using and which model you are training.
+
+**Log output**
+If you used `train.py` to launch the application, please include the contents of the output log file.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Command/Script to reproduce
+2. What packages are required and their versions
+3. How to run the script
+4. ...
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**ds_report output**
+Please run `ds_report` to give us details about your setup.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**System info (please complete the following information):**
+ - OS: [e.g. Ubuntu 18.04]
+ - GPU count and types [e.g. two machines with x8 A100s each]
+ - (if applicable) which [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version you are using
+ - (if applicable) Hugging Face Transformers/Accelerate/etc. versions
+ - Python version
+ - Any other relevant info about your setup
+
+**Docker context**
+Are you using a specific docker image that you can share?
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/workflows/amd.yml b/.github/workflows/amd-mi100.yml
similarity index 68%
rename from .github/workflows/amd.yml
rename to .github/workflows/amd-mi100.yml
index 1552bff9695a..7ad0f4178db4 100644
--- a/.github/workflows/amd.yml
+++ b/.github/workflows/amd-mi100.yml
@@ -1,35 +1,29 @@
-name: amd
+name: amd-mi100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
- pull_request:
- paths-ignore:
- - 'docs/**'
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
- unit-tests:
+ amd-tests:
# The type of runner that the job will run on
- runs-on: [self-hosted, amd]
+ runs-on: [self-hosted, amd, mi100]
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
+ pip install --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -56,7 +50,7 @@ jobs:
# Runs a set of commands using the runners shell
- name: Unit tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 -n 4 --verbose unit/
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/
+ pytest $PYTEST_OPTS -n 4 --verbose unit/
+ pytest $PYTEST_OPTS -m 'sequential' unit/
diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
new file mode 100644
index 000000000000..8c4292d4675c
--- /dev/null
+++ b/.github/workflows/amd-mi200.yml
@@ -0,0 +1,83 @@
+name: amd-mi200
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ amd-tests:
+ # The type of runner that the job will run on
+ runs-on: [self-hosted, amd, mi200]
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ # git checkout 1cc453d33
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install (ROCm) apex
+ run: |
+ git clone https://github.com/ROCmSoftwarePlatform/apex.git
+ cd apex
+ git checkout torch_2.1_higher
+ CURRENT_VER=$(git rev-parse HEAD)
+ INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
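+          # Rebuild apex only when the checked-out commit differs from the version recorded for the cached /blob/amd-apex build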
+ if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
+ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings="--global-option=--cpp_ext" --config-settings="--global-option=--cuda_ext" --target=/blob/amd-apex/ --upgrade .
+ git rev-parse HEAD > /blob/amd-apex/.venv_installed_version
+ fi
+ echo PYTHONPATH=$PYTHONPATH:/blob/amd-apex/ >> $GITHUB_ENV
+      # Runs a set of commands using the runner's shell
+ - name: Install deepspeed
+ run: |
+ pip install .[dev,1bit,autotuning]
+ #python -c "from deepspeed.env_report import cli_main; cli_main()"
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+      # Runs a set of commands using the runner's shell
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ pytest $PYTEST_OPTS -n 4 --verbose unit/
+ pytest $PYTEST_OPTS -m 'sequential' unit/
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/auto-sync.yml b/.github/workflows/auto-sync.yml
index 5cc5dc02224f..bfbf5a2ae37a 100644
--- a/.github/workflows/auto-sync.yml
+++ b/.github/workflows/auto-sync.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
with:
token: ${{ secrets.GHP_TOKEN }}
repository: ${{ secrets.DST_REPO }}
diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
new file mode 100644
index 000000000000..521fe2b5bea4
--- /dev/null
+++ b/.github/workflows/cpu-inference.yml
@@ -0,0 +1,75 @@
+name: cpu-inference
+
+on:
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ unit-tests:
+ runs-on: ubuntu-20.04
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Detect instruction sets on instance
+ run: |
+ lscpu
+ pip install cmake
+ git clone https://github.com/intel/intel-extension-for-pytorch
+ cd intel-extension-for-pytorch/tests/cpu/isa
+ cmake .
+ make
+ ./cpu_features
+
+ - name: Install numactl
+ run: |
+ sudo apt-get install -y numactl
+
+ - name: Install oneCCL Bindings for PyTorch
+ run: |
+ python -m pip install intel_extension_for_pytorch
+ python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu
+
+ - name: Install oneCCL
+ run: |
+ git clone https://github.com/oneapi-src/oneCCL
+ cd oneCCL
+ mkdir build
+ cd build
+ cmake ..
+ make
+ make install
+ #source ./_install/env/setvars.sh
+ # test whether oneCCL is correctly installed
+ #mpirun -n 2 ./examples/benchmark/benchmark
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install deepspeed
+ run: |
+ # check why the host does not have AVX2 support
+ pip install .[dev,1bit,autotuning,inf]
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: Unit tests
+ run: |
+ source oneCCL/build/_install/env/setvars.sh
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+ TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index b62e6266cbb5..a168af277fb8 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -1,12 +1,13 @@
name: Formatting
on:
- push:
- branches:
- - 'staging**'
pull_request:
branches:
'**'
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,19 +20,20 @@ jobs:
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: environment
run: |
which python
python --version
- - name: Install deepspeed
+ - name: Install dependencies
run: |
- pip install .[dev,autotuning]
- ds_report
+          # Previously we would do pip install .[dev], but this started causing
+          # out-of-space errors with the torch 2.1.0 release
+ grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install
- name: Formatting checks
run: |
- pip show pre-commit clang-format
- pre-commit run --all-files
+ pip show pre-commit clang-format
+ pre-commit run --all-files
diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
new file mode 100644
index 000000000000..a2b99de488d5
--- /dev/null
+++ b/.github/workflows/nv-a6000.yml
@@ -0,0 +1,63 @@
+name: nv-a6000
+
+on:
+ pull_request:
+ paths:
+ - "deepspeed/inference/v2/**"
+ - "tests/unit/inference/v2/**"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, a6000]
+ container:
+ image: nvcr.io/nvidia/pytorch:23.03-py3
+ ports:
+ - 80
+ options: --gpus all --shm-size "8G"
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Check container state
+ run: |
+ ldd --version
+ nvcc --version
+ nvidia-smi
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Install transformers
+ run: |
+ git clone --depth=1 https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ python -m pip install .
+ - name: Install deepspeed
+ run: |
+ python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+ python -m pip install .[dev,1bit,autotuning]
+ ds_report
+ - name: Python environment
+ run: |
+ python -m pip list
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
+ - name: MII unit tests
+ run: |
+ git clone --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
+ cd DeepSpeed-MII
+ pip install .[dev]
+ cd tests
+ python -m pytest --color=yes --durations=0 --verbose -rF ./
diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 081e2c7b0f00..0f6491e08336 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -1,14 +1,16 @@
name: nv-accelerate-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu111, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -41,11 +43,9 @@ jobs:
- name: HF Accelerate tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
git clone https://github.com/huggingface/accelerate
cd accelerate
- # tmp fix
- git checkout 5f4ba04628eeea14f9d248ab0e54399899503532
git rev-parse --short HEAD
# installing dependencies
pip install .[testing]
@@ -54,4 +54,4 @@ jobs:
# tmp fix: force newer datasets version
#pip install "datasets>=2.0.0"
pip list
- HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
+ pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed
diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
new file mode 100644
index 000000000000..b53fac36315b
--- /dev/null
+++ b/.github/workflows/nv-ds-chat.yml
@@ -0,0 +1,69 @@
+name: nv-ds-chat
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+ inputs:
+ dse_branch:
+ description: 'DeepSpeedExamples Branch'
+ required: false
+ default: 'master'
+ type: string
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, cu116, v100]
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install deepspeed
+ run: |
+ pip install .[dev]
+ ds_report
+
+ - name: Install deepspeed-chat
+ run: |
+ BRANCH="master"
+ if [[ ! -z "${{ github.event.inputs.dse_branch }}" ]]; then
+ BRANCH="${{ github.event.inputs.dse_branch }}"
+ fi
+ echo "DeepSpeedExamples Branch: $BRANCH"
+ git clone -b $BRANCH https://github.com/microsoft/DeepSpeedExamples.git
+ cd DeepSpeedExamples/applications/DeepSpeed-Chat
+ pip install -r requirements.txt
+ pip install -e .
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: DS-Chat unit tests
+ run: |
+ cd DeepSpeedExamples/applications/DeepSpeed-Chat
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ pytest $PYTEST_OPTS ./
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml
new file mode 100644
index 000000000000..a1b812b3eafd
--- /dev/null
+++ b/.github/workflows/nv-h100.yml
@@ -0,0 +1,65 @@
+name: nv-h100
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, h100]
+ container:
+ image: nvcr.io/nvidia/pytorch:23.03-py3
+ ports:
+ - 80
+ options: --gpus all --shm-size "8G"
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Check container state
+ run: |
+ nvidia-smi
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ python -m pip install .
+
+ - name: Install deepspeed
+ run: |
+ python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+ python -m pip install .[dev,1bit,autotuning]
+ ds_report
+
+ - name: Python environment
+ run: |
+ python -m pip list
+
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ python -m pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
+ python -m pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
index 01e6ac9ee264..f20b4496b6df 100644
--- a/.github/workflows/nv-inference.yml
+++ b/.github/workflows/nv-inference.yml
@@ -1,14 +1,16 @@
name: nv-inference
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -34,12 +36,13 @@ jobs:
run: |
git clone https://github.com/huggingface/transformers
cd transformers
+ git checkout f370bebdc
git rev-parse --short HEAD
pip install .
- name: Install deepspeed
run: |
- pip install .[dev,1bit,autotuning,inf]
+ pip install .[dev,1bit,autotuning,inf,triton]
ds_report
- name: Python environment
@@ -49,8 +52,14 @@
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
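+          # Run pytest under coverage; each test process writes its own data file, merged by the "coverage combine" step below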
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+
+ - name: Coverage report
+ run: |
+ cd tests
+ coverage combine
+ coverage report -m
diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
index e564a29d1ab2..d25d40aef967 100644
--- a/.github/workflows/nv-lightning-v100.yml
+++ b/.github/workflows/nv-lightning-v100.yml
@@ -1,14 +1,16 @@
name: nv-lightning-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu111, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -41,8 +43,8 @@ jobs:
- name: PyTorch Lightning Tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
pip install pytorch-lightning
pip install "protobuf<4.21.0"
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose lightning/
+ pytest $PYTEST_OPTS lightning/
diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml
index c490bc45d357..3a3b70dcd17d 100644
--- a/.github/workflows/nv-megatron.yml
+++ b/.github/workflows/nv-megatron.yml
@@ -1,14 +1,16 @@
name: nv-megatron
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -37,7 +39,15 @@ jobs:
- name: Install apex
run: |
- pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git
+ git clone https://github.com/NVIDIA/apex.git
+ cd apex
+ CURRENT_VER=$(git rev-parse HEAD)
+ INSTALLED_VER=$(cat /blob/apex/.venv_installed_version)
+ if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
+ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--global-option=--cpp_ext" --config-settings "--global-option=--cuda_ext" --target=/blob/apex/ --upgrade .
+ git rev-parse HEAD > /blob/apex/.venv_installed_version
+ fi
+ echo PYTHONPATH=$PYTHONPATH:/blob/apex/ >> $GITHUB_ENV
- name: Python environment
run: |
@@ -45,10 +55,9 @@ jobs:
- name: Megatron unit tests
run: |
- git clone --branch mrwyattii/fix-deprecated-numpy-types https://github.com/microsoft/Megatron-DeepSpeed.git
+ git clone https://github.com/microsoft/Megatron-DeepSpeed.git
cd Megatron-DeepSpeed
pip install .
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose ./
+ pytest $PYTEST_OPTS ./
diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
index b5f54fad46ec..86de2a3b0bcb 100644
--- a/.github/workflows/nv-mii.yml
+++ b/.github/workflows/nv-mii.yml
@@ -1,14 +1,14 @@
name: nv-mii
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +19,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -54,6 +54,5 @@ jobs:
cd DeepSpeed-MII
pip install .[dev]
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- cd tests
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./
+ cd tests/legacy
+ pytest $PYTEST_OPTS --forked -m "deepspeed" ./
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index 04d545dadd6b..1ed7d34a6be4 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -8,19 +8,23 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
+permissions:
+ contents: read
+ issues: write
+
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -45,6 +49,14 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
new file mode 100644
index 000000000000..839312190d22
--- /dev/null
+++ b/.github/workflows/nv-pre-compile-ops.yml
@@ -0,0 +1,42 @@
+name: nv-pre-compile-ops
+
+on:
+ pull_request:
+ branches:
+ '**'
+ paths-ignore:
+ - 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build-ops:
+ runs-on: ubuntu-20.04
+ container:
+ image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: environment
+ run: |
+ which python
+ python --version
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Compile DeepSpeed Ops
+ run: |
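+          # DS_BUILD_OPS=1 pre-compiles every op for the archs in TORCH_CUDA_ARCH_LIST; the individual DS_BUILD_*=0 flags opt those ops back out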
+ TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+ - name: DS Report
+ run: |
+ ds_report
diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml
new file mode 100644
index 000000000000..5ca159074a4d
--- /dev/null
+++ b/.github/workflows/nv-sd.yml
@@ -0,0 +1,70 @@
+name: nv-sd
+
+on:
+ schedule:
+ - cron: "0 0 * * 0"
+ workflow_dispatch:
+ pull_request:
+ paths:
+ - "deepspeed/ops/transformer/inference/diffusers_**"
+ - "tests/unit/inference/test_stable_diffusion.py"
+ - "deepspeed/model_implementations/diffusers/unet.py"
+ - "deepspeed/model_implementations/diffusers/vae.py"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ sd-tests:
+ runs-on: [self-hosted, nvidia, a6000]
+ container:
+ image: nvcr.io/nvidia/pytorch:23.03-py3
+ ports:
+ - 80
+ options: --gpus all --shm-size "8G"
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Check container state
+ run: |
+ ldd --version
+ nvcc --version
+ nvidia-smi
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ python -m pip install .
+ - name: Install deepspeed
+ run: |
+ pip install image-similarity-measures
+ python -m pip install opencv-python==4.6.* --force-reinstall
+ python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+ python -m pip install .[dev,1bit,autotuning,sd]
+ ds_report
+ - name: Python environment
+ run: |
+ python -m pip list
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12"
+
+ - name: Open GitHub issue if weekly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch-latest-cpu.yml b/.github/workflows/nv-torch-latest-cpu.yml
index d0ccc29deaa5..9ca1529d9018 100644
--- a/.github/workflows/nv-torch-latest-cpu.yml
+++ b/.github/workflows/nv-torch-latest-cpu.yml
@@ -1,14 +1,16 @@
name: nv-torch-latest-cpu
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,7 +21,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
@@ -42,7 +44,6 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -n 4 unit/ --torch_ver="1.12"
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/ --torch_ver="1.12"
+ TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="1.12"
+ TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="1.12"
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index 2a3dcc4acc99..8813a4bb2c4f 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -1,14 +1,16 @@
name: nv-torch-latest-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -51,7 +53,12 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6"
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.1" --cuda_ver="11.8"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.1" --cuda_ver="11.8"
+
+ - name: Coverage report
+ run: |
+ cd tests
+ coverage combine
+ coverage report -m
diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml
index 625517d59167..d0df6e546982 100644
--- a/.github/workflows/nv-torch-nightly-v100.yml
+++ b/.github/workflows/nv-torch-nightly-v100.yml
@@ -8,19 +8,23 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
+permissions:
+ contents: read
+ issues: write
+
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
+ pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -45,7 +49,15 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
+ pytest $PYTEST_OPTS --forked -n 4 unit/
+ pytest $PYTEST_OPTS --forked -m 'sequential' unit/
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch18-v100.yml b/.github/workflows/nv-torch110-p40.yml
similarity index 54%
rename from .github/workflows/nv-torch18-v100.yml
rename to .github/workflows/nv-torch110-p40.yml
index 0e1cd79b419f..45f3e0438233 100644
--- a/.github/workflows/nv-torch18-v100.yml
+++ b/.github/workflows/nv-torch110-p40.yml
@@ -1,32 +1,31 @@
-name: nv-torch18-v100
+name: nv-torch110-p40
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
- pull_request:
- paths-ignore:
- - 'docs/**'
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
+permissions:
+ contents: read
+ issues: write
+
jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu111, v100]
+ runs-on: [self-hosted, nvidia, cu111, p40]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+ pip install -U --cache-dir $TORCH_CACHE torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -41,7 +40,7 @@ jobs:
- name: Install deepspeed
run: |
- pip install .[dev,1bit,autotuning]
+ pip install .[dev,1bit,autotuning] --no-build-isolation
ds_report
- name: Python environment
@@ -51,7 +50,14 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="11"
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.8" --cuda_ver="11"
+ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11.1"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch110-v100.yml b/.github/workflows/nv-torch110-v100.yml
new file mode 100644
index 000000000000..1fd8aaac0ffa
--- /dev/null
+++ b/.github/workflows/nv-torch110-v100.yml
@@ -0,0 +1,64 @@
+name: nv-torch110-v100
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, cu111, v100]
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip install -U --cache-dir $TORCH_CACHE torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ # git checkout 1cc453d33
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install deepspeed
+ run: |
+ pip install .[dev,1bit,autotuning] --no-build-isolation
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11"
+ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="1.10" --cuda_ver="11"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch18-p40.yml b/.github/workflows/nv-torch18-p40.yml
deleted file mode 100644
index 45aeeed5078f..000000000000
--- a/.github/workflows/nv-torch18-p40.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-name: nv-torch18-p40
-
-on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
- pull_request:
- paths-ignore:
- - 'docs/**'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-jobs:
- unit-tests:
- runs-on: [self-hosted, nvidia, cu101, p40]
-
- steps:
- - uses: actions/checkout@v2
-
- - id: setup-venv
- uses: ./.github/workflows/setup-venv
-
- - name: Install pytorch
- run: |
- pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu101
- python -c "import torch; print('torch:', torch.__version__, torch)"
- python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
-
- - name: Install transformers
- run: |
- git clone https://github.com/huggingface/transformers
- cd transformers
- # if needed switch to the last known good SHA until transformers@master is fixed
- # git checkout 1cc453d33
- git rev-parse --short HEAD
- pip install .
-
- - name: Install deepspeed
- run: |
- pip install .[dev,1bit,autotuning]
- ds_report
-
- - name: Python environment
- run: |
- pip list
-
- - name: Unit tests
- run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1"
diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index ec84f2234836..7753133f2886 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -1,14 +1,16 @@
name: nv-transformers-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -16,20 +18,30 @@ concurrency:
jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu111, v100]
+ runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+ # use the same pytorch version as transformers CI
+ pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ git checkout e7e9261a2
+ git rev-parse --short HEAD
+ pip install .
+
- name: Install deepspeed
run: |
pip install .[dev,autotuning]
@@ -41,19 +53,12 @@ jobs:
- name: HF transformers tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- git clone https://github.com/huggingface/transformers
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd transformers
- # if needed switch to the last known good SHA until transformers@master is fixed
- #git checkout 6268694e2
- git rev-parse --short HEAD
- # scipy/sklearn required for tests, using the 'dev' extra forces torch re-install
pip install .[testing]
# find reqs used in ds integration tests
find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
- # force datasets version due to issues
- pip install datasets==2.2.2
# force protobuf version due to issues
pip install "protobuf<4.21.0"
pip list
- HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
+ WANDB_DISABLED=true RUN_SLOW=1 pytest $PYTEST_OPTS tests/deepspeed
diff --git a/.github/workflows/pre-compile-ops.yml b/.github/workflows/pre-compile-ops.yml
deleted file mode 100644
index 2ff3bb6a4fc7..000000000000
--- a/.github/workflows/pre-compile-ops.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-# This is a basic workflow to help you get started with Actions
-
-name: Tests-w-precompiled-ops
-
-# Controls when the action will run.
-on:
- # Allows you to run this workflow manually from the Actions tab
- workflow_dispatch:
-
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
- # This workflow contains a single job called "build"
- build:
- # The type of runner that the job will run on
- runs-on: self-hosted
-
- # Steps represent a sequence of tasks that will be executed as part of the job
- steps:
- # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- - uses: actions/checkout@v2
-
- - id: setup-venv
- uses: ./.github/workflows/setup-venv
-
- # Runs a single command using the runners shell
- - name: environment
- run: |
- python -c "import torch; print('torch:', torch.__version__, torch)"
- python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
-
- # Runs a set of commands using the runners shell
- - name: Install deepspeed
- run: |
- DS_BUILD_OPS=1 pip install .[dev]
- ds_report
-
- - name: Formatting checks
- run: |
- pre-commit run --all-files
-
- # Runs a set of commands using the runners shell
- - name: Unit tests
- run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9de35a6d17f6..279bad471c01 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -1,12 +1,16 @@
name: python
on:
- push:
- branches:
- - 'staging**'
pull_request:
branches:
'**'
+ paths-ignore:
+ - 'docs/**'
+ - 'blogs/**'
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -24,7 +28,7 @@ jobs:
image: deepspeed/gh-builder:py${{ matrix.pyVersion }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: environment
run: |
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 000000000000..8e016b4169cb
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,51 @@
+name: Build and publish DeepSpeed release
+
+on:
+ push:
+ tags:
+ - 'v*.*.*'
+
+jobs:
+ deploy:
+ runs-on: ubuntu-20.04
+ environment: release-env
+
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ ref: "master"
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+ - name: Get release version from tag
+ run: |
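+          # ${GITHUB_REF#refs/*/v} strips the leading "refs/tags/v", leaving just the X.Y.Z version number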
+ echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> $GITHUB_ENV
+ - name: Check release version
+ run: |
+ pip install packaging
+ python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }}
+ - name: Build DeepSpeed
+ run: |
+ DS_BUILD_STRING=" " python setup.py sdist
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ password: ${{ secrets.PYPI_API_TOKEN }}
+ repository-url: https://upload.pypi.org/legacy/
+ - name: Bump version
+ run: |
+ python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }}
+ - name: Create Pull Request
+ uses: peter-evans/create-pull-request@v4
+ with:
+ token: ${{ secrets.GH_PAT }}
+ add-paths: |
+ version.txt
+ body: |
+ **Auto-generated PR to update version.txt after a DeepSpeed release**
+ Released version - ${{ env.RELEASE_VERSION }}
+ Author - @${{ github.actor }}
+ branch: AutoPR/${{ env.RELEASE_VERSION }}
+ assignees: ${{ github.actor }}
+ title: "Update version.txt after ${{ env.RELEASE_VERSION }} release"
+ author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>
diff --git a/.github/workflows/setup-venv/action.yml b/.github/workflows/setup-venv/action.yml
index dacd50b8d471..ce2c458b9e57 100644
--- a/.github/workflows/setup-venv/action.yml
+++ b/.github/workflows/setup-venv/action.yml
@@ -12,11 +12,25 @@
shell: bash
- id: create-venv
run: |
+ rm -rf ./unit-test-venv
python -m venv unit-test-venv
source ./unit-test-venv/bin/activate
python -m pip install --upgrade pip
+ pip install wheel # required after pip>=23.1
echo PATH=$PATH >> $GITHUB_ENV # Make it so venv is inherited for other steps
shell: bash
+ - id: set-env-vars
+ run: |
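+      # Shared /blob cache locations and the default pytest flags ($PYTEST_OPTS) consumed by the workflow test steps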
+ echo TEST_DATA_DIR=/blob/ >> $GITHUB_ENV
+ echo TRANSFORMERS_CACHE=/blob/transformers_cache/ >> $GITHUB_ENV
+ echo TORCH_EXTENSIONS_DIR=./torch-extensions/ >> $GITHUB_ENV
+ echo TORCH_CACHE=/blob/torch_cache/ >> $GITHUB_ENV
+ echo HF_DATASETS_CACHE=/blob/datasets_cache/ >> $GITHUB_ENV
+ echo MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ >> $GITHUB_ENV
+ echo CRITIC_CKPT_DIR=/blob/step2_opt_125m_ckpt/ >> $GITHUB_ENV
+ echo PYTEST_OPTS="--color=yes --durations=0 --verbose -rF" >> $GITHUB_ENV
+ shell: bash
- id: print-env
run: |
which python
diff --git a/.gitignore b/.gitignore
index ab364ad8a7e7..5b9cc7ac3156 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,31 +1,40 @@
+# Ignore Python compiled files
*.pyc
-.idea/
-*~
-*.swp
+
+# Ignore IDE-specific files and directories (JetBrains, VS Code, Theia)
+.idea/
+.vscode/
+.theia/
+
+# Ignore temporary and backup files (general backups, Vim swap files)
+*~
+*.swp
+
+# Ignore log files
*.log
+
+# Ignore the generated version info file
deepspeed/git_version_info_installed.py
+
+# Ignore Python bytecode cache
__pycache__
# Build + installation data
-build/
-dist/
-*.so
-deepspeed.egg-info/
-build.txt
-
-# Website
-docs/_site/
-docs/build
+build/
+dist/
+*.so
+deepspeed.egg-info/
+build.txt
+
+# Website generated files (Jekyll site and generated docs)
+docs/_site/
+docs/build
docs/code-docs/source/_build
docs/code-docs/_build
docs/code-docs/build
-.sass-cache/
-.jekyll-cache/
+.sass-cache/
+.jekyll-cache/
.jekyll-metadata
# Testing data
-tests/unit/saved_checkpoint/
-
-# Dev/IDE data
-.vscode
-.theia
+tests/unit/saved_checkpoint/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index db67534e1936..2432a7a24124 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,12 +22,12 @@ repos:
- id: requirements-txt-fixer
- id: trailing-whitespace
-- repo: https://github.com/pre-commit/mirrors-yapf
- rev: v0.31.0
+- repo: https://github.com/google/yapf
+ rev: v0.32.0
hooks:
- id: yapf
-- repo: https://gitlab.com/daverona/pre-commit-cpp
+- repo: https://gitlab.com/daverona/pre-commit/cpp
rev: 0.8.0
hooks:
- id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
@@ -38,7 +38,7 @@ repos:
- id: check-torchdist
name: check-torchdist
entry: ./scripts/check-torchdist.py
- language: script
+ language: python
exclude: ^(deepspeed/comm/|docs/|benchmarks/|scripts/check-torchdist.py|deepspeed/moe/sharded_moe.py|deepspeed/runtime/comm/coalesced_collectives.py|deepspeed/elasticity/elastic_agent.py|deepspeed/launcher/launch.py|tests/unit/comm/test_dist.py)
# Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm
@@ -47,8 +47,9 @@ repos:
- id: check-license
name: check-license
entry: ./scripts/check-license.py
- language: script
- files: \.(py|cc|cu|h|cuh|hip)$
+ language: python
+ files: \.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr)$
+ exclude: ^(deepspeed/inference/v2/kernels/ragged_ops/blocked_flash|deepspeed/inference/v2/kernels/cutlass_ops/grouped_gemm)
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
@@ -58,7 +59,7 @@ repos:
# Do not check files that are automatically generated
'--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
'--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
- '--ignore-words-list=unsupport', # Word used in error messages that need rewording
+          '--ignore-words-list=youn,unsupport,noe', # Words used in error messages that need rewording
--check-filenames,
--check-hidden
]
@@ -67,4 +68,13 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
- args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']
+ args: ['--config=.flake8']
+
+- repo: local
+ hooks:
+ - id: check-torchcuda
+ name: check-torchcuda
+ entry: ./scripts/check-torchcuda.py
+ language: python
+ exclude: ^(.github/workflows/|scripts/check-torchcuda.py|docs/_tutorials/accelerator-abstraction-interface.md|accelerator/cuda_accelerator.py|deepspeed/inference/engine.py|deepspeed/model_implementations/transformers/clip_encoder.py|deepspeed/model_implementations/diffusers/vae.py|deepspeed/model_implementations/diffusers/unet.py|op_builder/spatial_inference.py|op_builder/transformer_inference.py|op_builder/builder.py|setup.py|tests/unit/ops/sparse_attention/test_sparse_attention.py)
+ # Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm
diff --git a/.readthedocs.yml b/.readthedocs.yml
index a2da36620152..91102a7de54b 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,6 +1,9 @@
-
# Required
version: 2
+build:
+ os: "ubuntu-22.04"
+ tools:
+ python: "3.8"
# Build documentation in the docs/ directory with Sphinx
sphinx:
@@ -13,6 +16,5 @@ formats:
# Optionally set the version of Python and requirements required to build your docs
python:
- version: 3.7
install:
- requirements: requirements/requirements-readthedocs.txt
diff --git a/.style.yapf b/.style.yapf
index 4a4850fe4df6..be8721dd3e5c 100644
--- a/.style.yapf
+++ b/.style.yapf
@@ -1,3 +1,3 @@
[style]
-SPLIT_ALL_COMMA_SEPARATED_VALUES = true
-COLUMN_LIMIT = 89
+SPLIT_ALL_COMMA_SEPARATED_VALUES = false
+COLUMN_LIMIT = 119
diff --git a/CODEOWNERS b/CODEOWNERS
index 5fc20409c276..2410b3ebc09b 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -7,7 +7,7 @@
# top-level repo folders
-/.github/ @jeffra @mrwyattii
+/.github/ @jeffra @mrwyattii @loadams
/azure/ @jeffra @awan-10
/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith
/bin/ @jeffra
diff --git a/LICENSE b/LICENSE
index 9e841e7a26e4..261eeb9e9f8b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,201 @@
- MIT License
-
- Copyright (c) Microsoft Corporation.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/MANIFEST.in b/MANIFEST.in
index 2fec750c6644..ab79573ef96c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,6 @@
include *.txt README.md
+include deepspeed/inference/v2/kernels/ragged_ops/libs/*.so
+include deepspeed/inference/v2/kernels/cutlass_ops/libs/*.so
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
diff --git a/README.md b/README.md
index bfa03a6e8c9a..b50b85af844f 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,4 @@
[![License MIT](https://badgen.net/badge/license/MIT/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
-[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/)
-[![Downloads](https://pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed)
-[![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status)
@@ -9,222 +6,11 @@
-## Latest News
- DeepSpeed trained the world's most powerful language models ([MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)); [learn how](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/).
+## DeeperSpeed
-* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without providing an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/)
-* [2022/12] [DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality](https://www.deepspeed.ai/2022/12/11/data-efficiency.html)
-* [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img)
-* [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html)
-* [2022/09] [ZeRO-Inference: Democratizing massive model inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html)
-* [2022/07] [Azure and DeepSpeed empower easy-to-use and high-performance model training](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/)
+DeeperSpeed is a fork of Microsoft's [DeepSpeed](https://github.com/microsoft/DeepSpeed) library that is tailor-made for [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) by [EleutherAI](https://www.eleuther.ai/).
----
+Prior to 3/9/2023, DeeperSpeed was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries:
-# Extreme Speed and Scale for DL Training and Inference
-
-[DeepSpeed](https://www.deepspeed.ai/) is an easy-to-use deep learning optimization software suite that enables unprecedented scale and speed for Deep Learning Training and Inference. With DeepSpeed you can:
-
-* Train/Inference dense or sparse models with billions or trillions of parameters
-* Achieve excellent system throughput and efficiently scale to thousands of GPUs
-* Train/Inference on resource constrained GPU systems
-* Achieve unprecedented low latency and high throughput for inference
-* Achieve extreme compression for an unparalleled inference latency and model size reduction with low costs
-
----
-
-# DeepSpeed's three innovation pillars
-
-
-
-
-## DeepSpeed-Training
-
-DeepSpeed offers a confluence of system innovations, that has made large scale DL training effective, and efficient, greatly improved ease of use, and redefined the DL training landscape in terms of scale that is possible. These innovations such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, ZeRO-Infinity, etc. fall under the training pillar. Learn more: [DeepSpeed-Training](https://www.deepspeed.ai/training/)
-
-## DeepSpeed-Inference
-
-DeepSpeed brings together innovations in parallelism technology such as tensor, pipeline, expert and ZeRO-parallelism, and combines them with high performance custom inference kernels, communication optimizations and heterogeneous memory technologies to enable inference at an unprecedented scale, while achieving unparalleled latency, throughput and cost reduction. This systematic composition of system technologies for inference falls under the inference pillar. Learn more: [DeepSpeed-Inference](https://www.deepspeed.ai/inference)
-
-
-## DeepSpeed-Compression
-
-To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression)
-
----
-
-# DeepSpeed Software Suite
-
-## DeepSpeed Library
-
- The [DeepSpeed](https://github.com/microsoft/deepspeed) library (this repository) implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)).
-
-## Model Implementations for Inference (MII)
-
- [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions.
-
-## DeepSpeed on Azure
-
- DeepSpeed users are diverse and have access to different environments. We recommend to try DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/).
-
----
-
-# DeepSpeed Adoption
-
-DeepSpeed is an important part of Microsoft’s new
-[AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/)
-initiative to enable next-generation AI capabilities at scale, where you can find more
-information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale).
-
-DeepSpeed has been used to train many different large-scale models, below is a list of several examples that we are aware of (if you'd like to include your model please submit a PR):
-
- * [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/)
- * [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)
- * [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed)
- * [GLM (130B)](https://github.com/THUDM/GLM-130B)
- * [YaLM (100B)](https://github.com/yandex/YaLM-100B)
- * [GPT-NeoX (20B)](https://github.com/EleutherAI/gpt-neox)
- * [AlexaTM (20B)](https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning)
- * [Turing NLG (17B)](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/)
- * [METRO-LM (5.4B)](https://arxiv.org/pdf/2204.06644.pdf)
-
-DeepSpeed has been integrated with several different popular open-source DL frameworks such as:
-
-| | Documentation |
-| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
- | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) |
-| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) |
-| | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) |
-| | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
-| | [Determined with DeepSpeed](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) |
-
----
-
-# Build Pipeline Status
-
-| Description | Status |
-| ----------- | ------ |
-| NVIDIA | [![nv-torch12-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml) [![nv-torch18-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) |
-| AMD | [![amd](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml) |
-| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
-| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) |
-| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)|
-
-# Installation
-
-The quickest way to get started with DeepSpeed is via pip, this will install
-the latest release of DeepSpeed which is not tied to specific PyTorch or CUDA
-versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer
-to as our 'ops'. By default, all of these extensions/ops will be built
-just-in-time (JIT) using [torch's JIT C++ extension loader that relies on
-ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and
-dynamically link them at runtime.
-
-## Requirements
-* [PyTorch](https://pytorch.org/) must be installed _before_ installing DeepSpeed.
-* For full feature support we recommend a version of PyTorch that is >= 1.8 and ideally the latest PyTorch stable release.
-* A CUDA or ROCm compiler such as [nvcc](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#introduction) or [hipcc](https://github.com/ROCm-Developer-Tools/HIPCC) used to compile C++/CUDA/HIP extensions.
-* Specific GPUs we develop and test against are listed below, this doesn't mean your GPU will not work if it doesn't fall into this category it's just DeepSpeed is most well tested on the following:
- * NVIDIA: Pascal, Volta, Ampere, and Hopper architectures
- * AMD: MI100 and MI200
-
-## PyPI
-We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases.
-
-```bash
-pip install deepspeed
-```
-
-After installation, you can validate your install and see which extensions/ops
-your machine is compatible with via the DeepSpeed environment report.
-
-```bash
-ds_report
-```
-
-If you would like to pre-install any of the DeepSpeed extensions/ops (instead
-of JIT compiling) or install pre-compiled ops via PyPI please see our [advanced
-installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/).
-
-## Windows
-Windows support is partially supported with DeepSpeed. On Windows you can build wheel with following steps, currently only inference mode is supported.
-1. Install pytorch, such as pytorch 1.8 + cuda 11.1
-2. Install visual cpp build tools, such as VS2019 C++ x64/x86 build tools
-3. Launch cmd console with Administrator privilege for creating required symlink folders
-4. Run `python setup.py bdist_wheel` to build wheel in `dist` folder
-
-# Features
-
-Please checkout [DeepSpeed-Training](https://www.deepspeed.ai/training), [DeepSpeed-Inference](https://www.deepspeed.ai/inference) and [DeepSpeed-Compression](https://www.deepspeed.ai/compression) pages for full set of features offered along each of these three pillars.
-
-# Further Reading
-
-All DeepSpeed documentation, tutorials, and blogs can be found on our website: [deepspeed.ai](https://www.deepspeed.ai/)
-
-
-| | Description |
-| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
-| [Getting Started](https://www.deepspeed.ai/getting-started/) | First steps with DeepSpeed |
-| [DeepSpeed JSON Configuration](https://www.deepspeed.ai/docs/config-json/) | Configuring DeepSpeed |
-| [API Documentation](https://deepspeed.readthedocs.io/en/latest/) | Generated DeepSpeed API documentation |
-| [Tutorials](https://www.deepspeed.ai/tutorials/) | Tutorials |
-| [Blogs](https://www.deepspeed.ai/posts/) | Blogs |
-
-
-# Contributing
-DeepSpeed welcomes your contributions! Please see our
-[contributing](CONTRIBUTING.md) guide for more details on formatting, testing,
-etc.
-
-## Contributor License Agreement
-This project welcomes contributions and suggestions. Most contributions require you to
-agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
-actually do, grant us the rights to use your contribution. For details, visit
-https://cla.opensource.microsoft.com.
-
-When you submit a pull request, a CLA bot will automatically determine whether you need
-to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
-follow the instructions provided by the bot. You will only need to do this once across
-all repos using our CLA.
-
-## Code of Conduct
-This project has adopted the [Microsoft Open Source Code of
-Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the
-[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
-[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
-
-# Publications
-1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727).
-2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703).
-3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html).
-4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie).
-5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html).
-6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205).
-7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/).
-8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh).
-9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009).
-10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html).
-11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990).
-12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl).
-13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1).
-14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946).
-15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586).
-16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597).
-
-
-# Videos
-1. DeepSpeed KDD 2020 Tutorial
- 1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29)
- 2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28)
- 3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27)
- 4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26)
- 5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93)
- 6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24)
-2. Microsoft Research Webinar
- * Registration is free and all videos are available on-demand.
- * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html).
-3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8)
-4. Community Tutorials
- * [DeepSpeed: All the tricks to scale to gigantic models (Mark Saroufim)](https://www.youtube.com/watch?v=pDGI668pNg0)
- * [Turing-NLG, DeepSpeed and the ZeRO optimizer (Yannic Kilcher)](https://www.youtube.com/watch?v=tC01FRB0M7w)
- * [Ultimate Guide To Scaling ML Models (The AI Epiphany)](https://www.youtube.com/watch?v=hc0u4avAkuM)
+- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on.
+- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward.
diff --git a/SECURITY.md b/SECURITY.md
index e0dfff56a956..9e9391ee0bac 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -12,7 +12,7 @@ If you believe you have found a security vulnerability in any Microsoft-owned re
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
-If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
diff --git a/accelerator/__init__.py b/accelerator/__init__.py
index d29c3b15467b..efed1ef84aca 100644
--- a/accelerator/__init__.py
+++ b/accelerator/__init__.py
@@ -1,4 +1,7 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
from .abstract_accelerator import DeepSpeedAccelerator
-from .real_accelerator import get_accelerator, set_accelerator
+from .real_accelerator import get_accelerator, set_accelerator, is_current_accelerator_supported
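A short usage sketch of the public API exported here, assuming DeepSpeed is installed; `is_current_accelerator_supported` is the newly exported helper, and the printed values below are examples, not guarantees:

```python
# Minimal usage sketch of the accelerator package exports added above.
from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported

acc = get_accelerator()                    # accelerator object for the detected backend
print(acc.device_name())                   # e.g. 'cuda', 'cpu', 'mps', or 'npu'
print(acc.communication_backend_name())    # e.g. 'nccl', 'ccl', or 'hccl'
print(is_current_accelerator_supported())  # whether the detected backend is supported
```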
diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
index 03dfbe9df43d..0c7ee3212fad 100644
--- a/accelerator/abstract_accelerator.py
+++ b/accelerator/abstract_accelerator.py
@@ -1,14 +1,22 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
import abc
from abc import ABC
class DeepSpeedAccelerator(ABC):
+
def __init__(self):
self._name = None
self._communication_backend_name = None
+ @abc.abstractmethod
+ def is_synchronized_device(self):
+ ...
+
# Device APIs
@abc.abstractmethod
def device_name(self, device_index):
@@ -139,6 +147,10 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
...
+ @abc.abstractmethod
+ def available_memory(self, device_index=None):
+ ...
+
# Data types
@abc.abstractmethod
def is_bf16_supported(self):
@@ -148,6 +160,10 @@ def is_bf16_supported(self):
def is_fp16_supported(self):
...
+ @abc.abstractmethod
+ def supported_dtypes(self):
+ ...
+
# Misc
@abc.abstractmethod
def amp(self):
@@ -173,6 +189,10 @@ def lazy_call(self, callback):
def communication_backend_name(self):
...
+ @abc.abstractmethod
+ def is_triton_supported(self):
+ ...
+
# Tensor operations
@property
@abc.abstractmethod
@@ -210,7 +230,11 @@ def LongTensor(self):
...
@abc.abstractmethod
- def pin_memory(self, tensor):
+ def pin_memory(self, tensor, align_bytes=1):
+ ...
+
+ @abc.abstractmethod
+ def is_pinned(self, tensor):
...
@abc.abstractmethod
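Because the new methods are declared with `@abc.abstractmethod`, every concrete accelerator must now implement them before it can be instantiated. A schematic (a toy stand-in, not a real backend) showing just two of the additions:

```python
import abc
from abc import ABC


class AcceleratorSketch(ABC):
    """Trimmed stand-in for the real interface, showing only new methods."""

    @abc.abstractmethod
    def is_synchronized_device(self):
        ...

    @abc.abstractmethod
    def available_memory(self, device_index=None):
        ...


class ToyAccelerator(AcceleratorSketch):

    def is_synchronized_device(self):
        return True  # a synchronous device needs no stream/event handling

    def available_memory(self, device_index=None):
        return 0  # illustrative placeholder


ToyAccelerator()  # succeeds; omitting either method would raise TypeError
```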
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py
new file mode 100644
index 000000000000..fdbbd33c07a2
--- /dev/null
+++ b/accelerator/cpu_accelerator.py
@@ -0,0 +1,282 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
+import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore
+import psutil
+import os
+
+
+# accelerator for Intel CPU
+class CPU_Accelerator(DeepSpeedAccelerator):
+
+ def __init__(self):
+ self._name = 'cpu'
+ self._communication_backend_name = 'ccl'
+ self.max_mem = psutil.Process().memory_info().rss
+
+ def is_synchronized_device(self):
+ return True
+
+ # Device APIs
+ def device_name(self, device_index=None):
+ return 'cpu'
+
+ def device(self, device_index=None):
+ return None
+
+ def set_device(self, device_index):
+ return
+
+ def current_device(self):
+        return int(os.environ.get('LOCAL_RANK', '0'))  # env vars are strings; normalize to int
+
+ def current_device_name(self):
+ return 'cpu'
+
+ def device_count(self):
+ device_count = int(os.environ.get('LOCAL_SIZE', 0))
+ if device_count > 0:
+ return device_count
+ else:
+ from deepspeed.utils.numa import get_numa_cores
+            # Count NUMA nodes to get the number of CPU accelerators. On machines
+            # with HBM in flat mode, HBM sits in a separate NUMA node with no cores;
+            # ignore those core-less NUMA nodes.
+ numa_core_lists = get_numa_cores()
+ numa_count = 0
+ prev_core_list = []
+ for core_list in numa_core_lists:
+ if len(core_list) > 0 and core_list != prev_core_list:
+ numa_count += 1
+ prev_core_list = core_list
+ return numa_count
+
+ def synchronize(self, device_index=None):
+ return
+
+ # RNG APIs
+ def random(self):
+ return torch.random
+
+ def set_rng_state(self, new_state, device_index=None):
+        if device_index is None:
+ return torch.set_rng_state(new_state)
+ return torch.set_rng_state(new_state, device_index)
+
+ def get_rng_state(self, device_index=None):
+ return torch.get_rng_state()
+
+ def manual_seed(self, seed):
+ return torch.manual_seed(seed)
+
+ def manual_seed_all(self, seed):
+ return torch.manual_seed(seed)
+
+ def initial_seed(self, seed):
+ return torch.initial_seed(seed)
+
+ def default_generator(self, device_index):
+ return torch.default_generator
+
+ # Streams/Events
+ @property
+ def Stream(self):
+ return None
+
+ def stream(self, stream):
+ from deepspeed.runtime.utils import noop_context
+ return noop_context()
+
+ def current_stream(self, device_index=None):
+ return None
+
+ def default_stream(self, device_index=None):
+ return None
+
+ @property
+ def Event(self):
+ return None
+
+ # Memory management
+ def empty_cache(self):
+ return
+
+ def get_rss(self):
+ mem = psutil.Process().memory_info().rss
+ if mem > self.max_mem:
+ self.max_mem = mem
+ return mem
+
+ def reset_rss(self):
+ mem = psutil.Process().memory_info().rss
+ self.max_mem = mem
+ return mem
+
+ def memory_allocated(self, device_index=None):
+ return self.get_rss()
+
+ def max_memory_allocated(self, device_index=None):
+ self.get_rss()
+ return self.max_mem
+
+ def reset_max_memory_allocated(self, device_index=None):
+ self.reset_rss()
+ return
+
+ def memory_cached(self, device_index=None):
+ return self.get_rss()
+
+ def max_memory_cached(self, device_index=None):
+ self.get_rss()
+ return self.max_mem
+
+ def reset_max_memory_cached(self, device_index=None):
+ self.reset_rss()
+ return
+
+ def memory_stats(self, device_index=None):
+ mem = self.get_rss()
+ mem_stat = {}
+ mem_stat['allocated_bytes.all.current'] = mem
+ mem_stat['allocated_bytes.all.peak'] = self.max_mem
+ return mem_stat
+
+ def reset_peak_memory_stats(self, device_index=None):
+ self.reset_rss()
+ return
+
+ def memory_reserved(self, device_index=None):
+ return self.get_rss()
+
+ def max_memory_reserved(self, device_index=None):
+ self.get_rss()
+ return self.max_mem
+
+ def total_memory(self, device_index=None):
+ return psutil.virtual_memory().total
+
+ def available_memory(self, device_index=None):
+ return psutil.virtual_memory().available
+
+ # Misc
+ def amp(self):
+ return torch.cpu.amp
+
+ def is_available(self):
+ return True
+
+ def range_push(self, msg):
+ # TODO itt is currently not supported yet
+ # return torch.profiler.itt.range_push(msg)
+ return
+
+ def range_pop(self):
+ # TODO itt is currently not supported yet
+ # return torch.profiler.itt.range_pop()
+ return
+
+ def lazy_call(self, callback):
+ return callback()
+
+ def communication_backend_name(self):
+ return self._communication_backend_name
+
+ def is_triton_supported(self):
+ return False
+
+ # Data types
+ def is_bf16_supported(self):
+ return True
+
+ def is_fp16_supported(self):
+ return False
+
+ def supported_dtypes(self):
+ return [torch.float, torch.bfloat16]
+
+ # Tensor operations
+
+ @property
+ def BFloat16Tensor(self):
+ return torch.BFloat16Tensor
+
+ @property
+ def ByteTensor(self):
+ return torch.ByteTensor
+
+ @property
+ def DoubleTensor(self):
+ return torch.DoubleTensor
+
+ @property
+ def FloatTensor(self):
+ return torch.FloatTensor
+
+ @property
+ def HalfTensor(self):
+ return torch.HalfTensor
+
+ @property
+ def IntTensor(self):
+ return torch.IntTensor
+
+ @property
+ def LongTensor(self):
+ return torch.LongTensor
+
+ def pin_memory(self, tensor, align_bytes=1):
+ return tensor
+
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
+ def op_builder_dir(self):
+ try:
+ # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+ # if successful this also means we're doing a local install and not JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+ return "op_builder.cpu"
+ except ImportError:
+ return "deepspeed.ops.op_builder.cpu"
+
+ def on_accelerator(self, tensor):
+ device_str = str(tensor.device)
+ if device_str.startswith('cpu'):
+ return True
+ else:
+ return False
+
+ # create an instance of op builder and return, name specified by class_name
+ def create_op_builder(self, op_name):
+ builder_class = self.get_op_builder(op_name)
+        if builder_class is not None:
+ return builder_class()
+ return None
+
+ # return an op builder class, name specified by class_name
+ def get_op_builder(self, class_name):
+ try:
+ # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+ # if successful this also means we're doing a local install and not JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+ from op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder
+ except ImportError:
+ from deepspeed.ops.op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder
+
+ if class_name == "CCLCommBuilder":
+ return CCLCommBuilder
+ elif class_name == "FusedAdamBuilder":
+ return FusedAdamBuilder
+ elif class_name == "CPUAdamBuilder":
+ return CPUAdamBuilder
+ else:
+            # return NotImplementedBuilder instead of None to avoid 'NoneType' attribute errors in unit tests
+ return NotImplementedBuilder
+
+ def build_extension(self):
+ from torch.utils.cpp_extension import BuildExtension
+ return BuildExtension
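The CPU backend has no device allocator, so it emulates CUDA's peak-memory counters with a process-RSS high-water mark via psutil, as in `get_rss`/`reset_rss` above. The pattern in isolation, as a minimal self-contained sketch:

```python
# Minimal sketch of the RSS high-water-mark pattern used by CPU_Accelerator.
import psutil


class RssHighWaterMark:
    """Track current and peak resident set size of this process."""

    def __init__(self):
        self.max_mem = psutil.Process().memory_info().rss

    def sample(self):
        mem = psutil.Process().memory_info().rss
        self.max_mem = max(self.max_mem, mem)
        return mem  # analogous to memory_allocated()

    def peak(self):
        self.sample()
        return self.max_mem  # analogous to max_memory_allocated()
```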
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index 945ba42a3a9c..2786b425ca7f 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -1,4 +1,7 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
import os
import pkgutil
@@ -12,30 +15,32 @@
except ImportError:
pass
+# Delay import pynvml to avoid import error when CUDA is not available
+pynvml = None
+
class CUDA_Accelerator(DeepSpeedAccelerator):
+
def __init__(self):
self._name = 'cuda'
self._communication_backend_name = 'nccl'
+ if pynvml is None:
+ self._init_pynvml()
- # begin initialize for create_op_builder()
- # put all valid class name <--> class type mapping into class_dict
- op_builder_dir = self.op_builder_dir()
- op_builder_module = importlib.import_module(op_builder_dir)
-
- for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
- # avoid self references
- if module_name != 'all_ops' and module_name != 'builder':
- module = importlib.import_module("{}.{}".format(
- op_builder_dir,
- module_name))
- for member_name in module.__dir__():
- if member_name.endswith(
- 'Builder'
- ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes
- if not member_name in self.class_dict:
- self.class_dict[member_name] = getattr(module, member_name)
- # end initialize for create_op_builder()
+ def _init_pynvml(self):
+ global pynvml
+ try:
+ import pynvml
+ except ImportError:
+ return
+ try:
+ pynvml.nvmlInit()
+ except pynvml.NVMLError:
+ pynvml = None
+ return
+
+ def is_synchronized_device(self):
+ return False
# Device APIs
def device_name(self, device_index=None):
@@ -148,6 +153,31 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
return torch.cuda.get_device_properties(device_index).total_memory
+ def _get_nvml_gpu_id(self, torch_gpu_id):
+ """
+ credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020
+
+ Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES.
+
+ If the latter isn't set return the same id
+ """
+ # if CUDA_VISIBLE_DEVICES is used automagically remap the id since pynvml ignores this env var
+ if "CUDA_VISIBLE_DEVICES" in os.environ:
+ ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
+ return ids[torch_gpu_id] # remap
+ else:
+ return torch_gpu_id
+
+ def available_memory(self, device_index=None):
+ if pynvml:
+ if device_index is None:
+ device_index = self.current_device()
+ handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index))
+ info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+ return info.free
+ else:
+ return self.total_memory(device_index) - self.memory_allocated(device_index)
+
# Data types
def is_bf16_supported(self):
return torch.cuda.is_bf16_supported()
@@ -159,6 +189,9 @@ def is_fp16_supported(self):
else:
return False
+ def supported_dtypes(self):
+ return [torch.float, torch.half, torch.bfloat16]
+
# Misc
def amp(self):
if hasattr(torch.cuda, 'amp'):
@@ -182,6 +215,13 @@ def lazy_call(self, callback):
def communication_backend_name(self):
return self._communication_backend_name
+ def is_triton_supported(self):
+ major, _ = torch.cuda.get_device_capability()
+ if major >= 8:
+ return True
+ else:
+ return False
+
# Tensor operations
@property
@@ -212,9 +252,12 @@ def IntTensor(self):
def LongTensor(self):
return torch.cuda.LongTensor
- def pin_memory(self, tensor):
+ def pin_memory(self, tensor, align_bytes=1):
return tensor.pin_memory()
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
def on_accelerator(self, tensor):
device_str = str(tensor.device)
if device_str.startswith('cuda:'):
@@ -226,7 +269,7 @@ def op_builder_dir(self):
try:
# is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
# if successful this also means we're doing a local install and not JIT compile path
- from op_builder import __deepspeed__ # noqa: F401
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
return "op_builder"
except ImportError:
return "deepspeed.ops.op_builder"
@@ -234,10 +277,35 @@ def op_builder_dir(self):
# dict that holds class name <--> class type mapping i.e.
# 'AsyncIOBuilder':
# this dict will be filled at init stage
- class_dict = {}
+ class_dict = None
+
+ def _lazy_init_class_dict(self):
+        if self.class_dict is not None:
+ return
+ else:
+ self.class_dict = {}
+ # begin initialize for create_op_builder()
+ # put all valid class name <--> class type mapping into class_dict
+ op_builder_dir = self.op_builder_dir()
+ op_builder_module = importlib.import_module(op_builder_dir)
+ op_builder_absolute_path = os.path.dirname(op_builder_module.__file__)
+ for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]):
+                # avoid self references; skip subdirectories that contain
+                # ops for other backends (cpu, npu, etc.).
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir(
+ os.path.join(op_builder_absolute_path, module_name)):
+ module = importlib.import_module("{}.{}".format(op_builder_dir, module_name))
+ for member_name in module.__dir__():
+ if member_name.endswith(
+ 'Builder'
+ ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes
+                        if member_name not in self.class_dict:
+ self.class_dict[member_name] = getattr(module, member_name)
+ # end initialize for create_op_builder()
# create an instance of op builder and return, name specified by class_name
def create_op_builder(self, class_name):
+ self._lazy_init_class_dict()
if class_name in self.class_dict:
return self.class_dict[class_name]()
else:
@@ -245,6 +313,7 @@ def create_op_builder(self, class_name):
# return an op builder class, name specified by class_name
def get_op_builder(self, class_name):
+ self._lazy_init_class_dict()
if class_name in self.class_dict:
return self.class_dict[class_name]
else:
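`available_memory` above prefers an NVML query because torch's allocator statistics do not account for memory held by other processes. A standalone sketch of the same query, including the `CUDA_VISIBLE_DEVICES` remapping, assuming pynvml is installed and NVML initializes on this machine:

```python
# Standalone sketch of the pynvml free-memory query used above.
import os
import pynvml

pynvml.nvmlInit()

torch_gpu_id = 0  # the torch device index we want to inspect
if "CUDA_VISIBLE_DEVICES" in os.environ:
    # pynvml ignores CUDA_VISIBLE_DEVICES, so remap torch's logical index
    # to the physical NVML index
    ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
    nvml_gpu_id = ids[torch_gpu_id]
else:
    nvml_gpu_id = torch_gpu_id

handle = pynvml.nvmlDeviceGetHandleByIndex(nvml_gpu_id)
print(pynvml.nvmlDeviceGetMemoryInfo(handle).free)  # bytes free on the device
```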
diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py
new file mode 100644
index 000000000000..77595f6b636c
--- /dev/null
+++ b/accelerator/mps_accelerator.py
@@ -0,0 +1,237 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+
+from .abstract_accelerator import DeepSpeedAccelerator
+
+# During the setup stage torch may not be installed yet; passing on a missing
+# torch still allows the op-builder-related APIs to be executed.
+try:
+ import torch.mps
+except ImportError:
+ pass
+
+
+class MPS_Accelerator(DeepSpeedAccelerator):
+
+ def __init__(self):
+ self._name = "mps"
+ self._communication_backend_name = None
+
+ def is_synchronized_device(self):
+ return False
+
+ # Device APIs
+ def device_name(self, device_index=None):
+        if device_index is None:
+ return "mps"
+ return "mps:{}".format(device_index)
+
+ def device(self, device_index):
+ return torch.device("mps", index=0)
+
+ def set_device(self, device_index):
+ return
+
+ def current_device(self):
+ return torch.device("mps", index=0)
+
+ def current_device_name(self):
+ return "mps:0"
+
+ def device_count(self):
+ return 1
+
+ def synchronize(self, device_index=None):
+ return torch.mps.synchronize()
+
+ # RNG APIs
+ def random(self):
+ return torch.random
+
+ def set_rng_state(self, new_state, device_index=None):
+ return torch.mps.set_rng_state(new_state)
+
+ def get_rng_state(self, device_index=None):
+ return torch.mps.get_rng_state()
+
+ def manual_seed(self, seed):
+ return torch.mps.manual_seed(seed)
+
+ def manual_seed_all(self, seed):
+ return torch.mps.manual_seed(seed)
+
+ def seed(self):
+ return torch.mps.seed()
+
+ def initial_seed(self, seed):
+ return
+
+ def default_generator(self, device_index):
+ return
+
+ # Streams/Events
+ @property
+ def Stream(self):
+ return None
+
+ def stream(self, stream):
+ return None
+
+ def current_stream(self, device_index=None):
+ return None
+
+ def default_stream(self, device_index=None):
+ return None
+
+ @property
+ def Event(self):
+ return None
+
+ # Memory management
+ def empty_cache(self):
+ return torch.mps.empty_cache()
+
+ def memory_allocated(self, device_index=None):
+ return torch.mps.current_allocated_memory()
+
+ def max_memory_allocated(self, device_index=None):
+ return torch.mps.driver_allocated_memory()
+
+ def set_per_process_memory_fraction(self, fraction):
+ return torch.mps.set_per_process_memory_fraction(fraction)
+
+ def reset_max_memory_allocated(self, device_index=None):
+ return
+
+ def memory_cached(self, device_index=None):
+ return
+
+ def max_memory_cached(self, device_index=None):
+ return
+
+ def reset_max_memory_cached(self, device_index=None):
+ return
+
+ def memory_stats(self, device_index=None):
+ return
+
+ def reset_peak_memory_stats(self, device_index=None):
+ return
+
+ def memory_reserved(self, device_index=None):
+ return
+
+ def max_memory_reserved(self, device_index=None):
+ return
+
+ def total_memory(self, device_index=None):
+ return
+
+ def available_memory(self, device_index=None):
+ return
+
+ # Data types
+ def is_bf16_supported(self):
+ return False
+
+ def is_fp16_supported(self):
+ return False
+
+ def supported_dtypes(self):
+ return [torch.float]
+
+ # Misc
+ def amp(self):
+ return
+
+ def is_available(self):
+ return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+
+ def range_push(self, msg):
+ return
+
+ def range_pop(self):
+ return
+
+ def lazy_call(self, callback):
+ return
+
+ def communication_backend_name(self):
+ return self._communication_backend_name
+
+ def is_triton_supported(self):
+ return False
+
+ # Tensor operations
+ @property
+ def BFloat16Tensor(self):
+ return
+
+ @property
+ def ByteTensor(self):
+ return
+
+ @property
+ def DoubleTensor(self):
+ return
+
+ @property
+ def FloatTensor(self):
+ return
+
+ @property
+ def HalfTensor(self):
+ return
+
+ @property
+ def IntTensor(self):
+ return
+
+ @property
+ def LongTensor(self):
+ return
+
+ def pin_memory(self, tensor, align_bytes=1):
+ return tensor.pin_memory()
+
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
+ def on_accelerator(self, tensor):
+ device_str = str(tensor.device)
+ if device_str.startswith("mps"):
+ return True
+ else:
+ return False
+
+ def op_builder_dir(self):
+ try:
+ # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+ # if successful this also means we're doing a local install and not JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+
+ return "op_builder"
+ except ImportError:
+ return "deepspeed.ops.op_builder"
+
+ # create an instance of op builder, specified by class_name
+ def create_op_builder(self, op_name):
+ builder_class = self.get_op_builder(op_name)
+        if builder_class is not None:
+ return builder_class()
+ return None
+
+ # return an op builder class, specified by class_name
+ def get_op_builder(self, class_name):
+ from deepspeed.ops.op_builder.cpu import NotImplementedBuilder
+
+ return NotImplementedBuilder
+
+ def build_extension(self):
+ from torch.utils.cpp_extension import BuildExtension
+
+ return BuildExtension
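A quick way to exercise the MPS backend path shown above, assuming a macOS PyTorch build (2.0+) with MPS support compiled in:

```python
# Minimal check mirroring MPS_Accelerator.is_available() and the memory API.
import torch

if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    x = torch.ones(4, device="mps")  # allocate on the Apple GPU
    torch.mps.synchronize()          # MPS is asynchronous, like CUDA
    print(torch.mps.current_allocated_memory())
else:
    print("MPS backend not available")
```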
diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py
new file mode 100644
index 000000000000..7ebbd320bb15
--- /dev/null
+++ b/accelerator/npu_accelerator.py
@@ -0,0 +1,257 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+import importlib
+import inspect
+
+from .abstract_accelerator import DeepSpeedAccelerator
+# During the setup stage torch may not be installed yet; passing on a missing
+# torch still allows the op-builder-related APIs to be executed.
+try:
+ import torch.npu
+except ImportError:
+ pass
+
+
+class NPU_Accelerator(DeepSpeedAccelerator):
+
+ def __init__(self):
+ super().__init__()
+ self._name = 'npu'
+ self._communication_backend_name = 'hccl'
+        # dict that holds class name <--> class type mapping, e.g.
+        # 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
+        # filled lazily by _lazy_init_class_dict() on first op-builder lookup
+        self.class_dict = None
+
+ def is_synchronized_device(self):
+ return False
+
+ # Device APIs
+ def device_name(self, device_index=None):
+        if device_index is None:
+ return 'npu'
+ return 'npu:{}'.format(device_index)
+
+ def device(self, device_index=None):
+ return torch.npu.device(device_index)
+
+ def set_device(self, device_index):
+ torch.npu.set_device(device_index)
+
+ def current_device(self):
+ return torch.npu.current_device()
+
+ def current_device_name(self):
+ return 'npu:{}'.format(torch.npu.current_device())
+
+ def device_count(self):
+ return torch.npu.device_count()
+
+ def synchronize(self, device_index=None):
+ return torch.npu.synchronize(device_index)
+
+ # RNG APIs
+ def random(self):
+ return torch.random
+
+ def set_rng_state(self, new_state, device_index=None):
+ if device_index is None:
+ return torch.npu.set_rng_state(new_state)
+
+ return torch.npu.set_rng_state(new_state, device_index)
+
+ def get_rng_state(self, device_index=None):
+ if device_index is None:
+ return torch.npu.get_rng_state()
+
+ return torch.npu.get_rng_state(device_index)
+
+ def manual_seed(self, seed):
+ return torch.npu.manual_seed(seed)
+
+ def manual_seed_all(self, seed):
+ return torch.npu.manual_seed_all(seed)
+
+ def initial_seed(self, seed):
+ return torch.npu.initial_seed(seed)
+
+ def default_generator(self, device_index):
+ return torch.npu.default_generators[device_index]
+
+ # Streams/Events
+ @property
+ def Stream(self):
+ return torch.npu.Stream
+
+ def stream(self, stream):
+ return torch.npu.stream(stream)
+
+ def current_stream(self, device_index=None):
+ return torch.npu.current_stream(device_index)
+
+ def default_stream(self, device_index=None):
+ return torch.npu.default_stream(device_index)
+
+ @property
+ def Event(self):
+ return torch.npu.Event
+
+ # Memory management
+ def empty_cache(self):
+ return torch.npu.empty_cache()
+
+ def memory_allocated(self, device_index=None):
+ return torch.npu.memory_allocated(device_index)
+
+ def max_memory_allocated(self, device_index=None):
+ return torch.npu.max_memory_allocated(device_index)
+
+ def reset_max_memory_allocated(self, device_index=None):
+ return torch.npu.reset_max_memory_allocated(device_index)
+
+ def memory_cached(self, device_index=None):
+ return torch.npu.memory_cached(device_index)
+
+ def max_memory_cached(self, device_index=None):
+ return torch.npu.max_memory_cached(device_index)
+
+ def reset_max_memory_cached(self, device_index=None):
+ return torch.npu.reset_max_memory_cached(device_index)
+
+ def memory_stats(self, device_index=None):
+ if hasattr(torch.npu, 'memory_stats'):
+ return torch.npu.memory_stats(device_index)
+
+ def reset_peak_memory_stats(self, device_index=None):
+ if hasattr(torch.npu, 'reset_peak_memory_stats'):
+ return torch.npu.reset_peak_memory_stats(device_index)
+
+ def memory_reserved(self, device_index=None):
+ if hasattr(torch.npu, 'memory_reserved'):
+ return torch.npu.memory_reserved(device_index)
+
+ def max_memory_reserved(self, device_index=None):
+ if hasattr(torch.npu, 'max_memory_reserved'):
+ return torch.npu.max_memory_reserved(device_index)
+
+ def total_memory(self, device_index=None):
+ return torch.npu.get_device_properties(device_index).total_memory
+
+ def available_memory(self, device_index=None):
+ return self.total_memory(device_index) - self.memory_allocated(device_index)
+
+ # Data types
+ def is_bf16_supported(self):
+ return torch.npu.is_bf16_supported()
+
+ def is_fp16_supported(self):
+ return True
+
+ def supported_dtypes(self):
+ return [torch.float, torch.half, torch.bfloat16]
+
+ # Misc
+ def amp(self):
+ if hasattr(torch.npu, 'amp'):
+ return torch.npu.amp
+ return None
+
+ def is_available(self):
+ return torch.npu.is_available()
+
+ def range_push(self, msg):
+ return
+
+ def range_pop(self):
+ return
+
+ def lazy_call(self, callback):
+ return torch.npu._lazy_call(callback)
+
+ def communication_backend_name(self):
+ return self._communication_backend_name
+
+ def is_triton_supported(self):
+ return False
+
+ # Tensor operations
+
+ @property
+ def BFloat16Tensor(self):
+ return torch.npu.BFloat16Tensor
+
+ @property
+ def ByteTensor(self):
+ return torch.npu.ByteTensor
+
+ @property
+ def DoubleTensor(self):
+ return torch.npu.DoubleTensor
+
+ @property
+ def FloatTensor(self):
+ return torch.npu.FloatTensor
+
+ @property
+ def HalfTensor(self):
+ return torch.npu.HalfTensor
+
+ @property
+ def IntTensor(self):
+ return torch.npu.IntTensor
+
+ @property
+ def LongTensor(self):
+ return torch.npu.LongTensor
+
+ def pin_memory(self, tensor, align_bytes=1):
+ return tensor.pin_memory()
+
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
+    def on_accelerator(self, tensor):
+        device_str = str(tensor.device)
+        return device_str.startswith('npu:')
+
+ def op_builder_dir(self):
+ try:
+            # is op_builder from deepspeed or a third-party version? this should only succeed if it's deepspeed
+            # if successful this also means we're doing a local install and not the JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+ return "op_builder.npu"
+ except ImportError:
+ return "deepspeed.ops.op_builder.npu"
+
+ def _lazy_init_class_dict(self):
+ if self.class_dict:
+ return
+
+ op_builder_module = importlib.import_module(self.op_builder_dir())
+
+ # get op builder class from op_builder/npu/__init__.py
+ self.class_dict = {}
+ for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass):
+ self.class_dict[class_name] = class_obj
+
+    # create and return an instance of the op builder named by class_name
+ def create_op_builder(self, class_name):
+ builder_class = self.get_op_builder(class_name)
+ return None if builder_class is None else builder_class()
+
+ # return an op builder class, name specified by class_name
+ def get_op_builder(self, class_name):
+ self._lazy_init_class_dict()
+ if class_name in self.class_dict:
+ return self.class_dict[class_name]
+ else:
+ return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None
+
+ def build_extension(self):
+ from torch.utils.cpp_extension import BuildExtension
+ return BuildExtension
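
The class-dict machinery in NPU_Accelerator above is a small lazy registry: the op-builder module is imported once, on first lookup, every class it exposes is indexed by name, and unknown names fall back to NotImplementedBuilder. A standalone sketch of the same pattern (the LazyBuilderRegistry name is illustrative, not a DeepSpeed API):

-----------[code] lazy_registry.py -----------
import importlib
import inspect


class LazyBuilderRegistry:

    def __init__(self, module_name):
        self._module_name = module_name
        self._class_dict = None  # filled on first lookup, not at construction

    def _lazy_init(self):
        if self._class_dict is not None:
            return
        module = importlib.import_module(self._module_name)
        # index every class the module exposes by its name
        self._class_dict = dict(inspect.getmembers(module, inspect.isclass))

    def get_op_builder(self, class_name):
        self._lazy_init()
        # unknown names fall back to NotImplementedBuilder when available
        return self._class_dict.get(class_name, self._class_dict.get('NotImplementedBuilder'))
-----------[code] lazy_registry.py -----------

Deferring the import keeps NPU_Accelerator() constructible even when the op-builder package is not importable yet, matching the setup-stage concern noted at the top of the file.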
diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py
index 06cbb0b08e7a..49133489b051 100644
--- a/accelerator/real_accelerator.py
+++ b/accelerator/real_accelerator.py
@@ -1,4 +1,15 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+import os
+
+try:
+ # Importing logger currently requires that torch is installed, hence the try...except
+ # TODO: Remove logger dependency on torch.
+ from deepspeed.utils import logger as accel_logger
+except ImportError as e:
+ accel_logger = None
try:
from accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa1
@@ -9,6 +20,8 @@
except ImportError as e:
dsa2 = None
+SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'npu', 'mps']
+
ds_accelerator = None
@@ -23,51 +36,148 @@ def _validate_accelerator(accel_obj):
# accelerator.abstractor_accelerator
# or deepspeed.accelerator.abstract_accelerator, consider accel_obj
# is a conforming object
- if not ((dsa1 != None and isinstance(accel_obj,
- dsa1)) or
- (dsa2 != None and isinstance(accel_obj,
- dsa2))):
- raise AssertionError(
- f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator'
- )
+ if not ((dsa1 is not None and isinstance(accel_obj, dsa1)) or (dsa2 is not None and isinstance(accel_obj, dsa2))):
+ raise AssertionError(f"{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator")
# TODO: turn off is_available test since this breaks tests
- #assert accel_obj.is_available(), \
+ # assert accel_obj.is_available(), \
# f'{accel_obj.__class__.__name__} accelerator fails is_available() test'
+def is_current_accelerator_supported():
+    return get_accelerator()._name in SUPPORTED_ACCELERATOR_LIST
+
+
def get_accelerator():
global ds_accelerator
- if ds_accelerator is None:
+ if ds_accelerator is not None:
+ return ds_accelerator
+
+ accelerator_name = None
+ ds_set_method = None
+    # 1. Detect whether the accelerator is overridden via the DS_ACCELERATOR environment variable.
+    if "DS_ACCELERATOR" in os.environ:
+ accelerator_name = os.environ["DS_ACCELERATOR"]
+ if accelerator_name == "xpu":
+ try:
+ from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401 # type: ignore
+ except ImportError as e:
+ raise ValueError(
+ f"XPU_Accelerator requires intel_extension_for_deepspeed, which is not installed on this system.")
+ elif accelerator_name == "cpu":
+ try:
+ import intel_extension_for_pytorch # noqa: F401 # type: ignore
+ except ImportError as e:
+ raise ValueError(
+ f"CPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
+ elif accelerator_name == "npu":
+ try:
+ import torch_npu # noqa: F401 # type: ignore
+ except ImportError as e:
+ raise ValueError(f"NPU_Accelerator requires torch_npu, which is not installed on this system.")
+ pass
+ elif accelerator_name == "mps":
+ try:
+ import torch.mps
+
+            # torch.mps does not expose is_available() yet; use a memory query as a proxy
+ torch.mps.current_allocated_memory()
+ except (RuntimeError, ImportError) as e:
+ raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.")
+    elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST:
+        raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. '
+                         f'Value "{accelerator_name}" is not supported')
+ ds_set_method = "override"
+
+ # 2. If no override, detect which accelerator to use automatically
+ if accelerator_name is None:
+        # We need a way to choose among different accelerator types.
+        # Currently we detect which accelerator extension is installed
+        # in the environment and use it if one is found.
+        # An alternative would be to detect whether a CUDA device is
+        # present on the system, but that comes with two pitfalls:
+        # 1. the system may not have torch pre-installed, so
+        #    get_accelerator().is_available() may not work.
+        # 2. Some scenarios, like installing on a login node (without a
+        #    CUDA device) and running on a compute node (with one), can
+        #    cause a mismatch between installation time and runtime.
+
try:
- from intel_extension_for_deepspeed import XPU_Accelerator
+ from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401,F811 # type: ignore
+
+ accelerator_name = "xpu"
except ImportError as e:
pass
- else:
- ds_accelerator = XPU_Accelerator()
- _validate_accelerator(ds_accelerator)
- return ds_accelerator
-
+ if accelerator_name is None:
+ try:
+ import intel_extension_for_pytorch # noqa: F401,F811 # type: ignore
+
+ accelerator_name = "cpu"
+ except ImportError as e:
+ pass
+ if accelerator_name is None:
+ try:
+ import torch_npu # noqa: F401,F811 # type: ignore
+
+ accelerator_name = "npu"
+ except ImportError as e:
+ pass
+ if accelerator_name is None:
+ try:
+ import torch.mps
+
+            # torch.mps does not expose is_available() yet; use a memory query as a proxy
+ torch.mps.current_allocated_memory()
+ accelerator_name = "mps"
+ except (RuntimeError, ImportError) as e:
+ pass
+ if accelerator_name is None:
+ accelerator_name = "cuda"
+
+ ds_set_method = "auto detect"
+
+ # 3. Set ds_accelerator accordingly
+ if accelerator_name == "cuda":
from .cuda_accelerator import CUDA_Accelerator
+
ds_accelerator = CUDA_Accelerator()
- _validate_accelerator(ds_accelerator)
+ elif accelerator_name == "cpu":
+ from .cpu_accelerator import CPU_Accelerator
+
+ ds_accelerator = CPU_Accelerator()
+ elif accelerator_name == "xpu":
+ # XPU_Accelerator is already imported in detection stage
+ ds_accelerator = XPU_Accelerator()
+ elif accelerator_name == "npu":
+ from .npu_accelerator import NPU_Accelerator
+
+ ds_accelerator = NPU_Accelerator()
+ elif accelerator_name == "mps":
+ from .mps_accelerator import MPS_Accelerator
+
+ ds_accelerator = MPS_Accelerator()
+ _validate_accelerator(ds_accelerator)
+ if accel_logger is not None:
+ accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})")
return ds_accelerator
def set_accelerator(accel_obj):
global ds_accelerator
_validate_accelerator(accel_obj)
+ if accel_logger is not None:
+ accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
ds_accelerator = accel_obj
-'''
+"""
-----------[code] test_get.py -----------
from deepspeed.accelerator import get_accelerator
+from deepspeed.utils import logger
my_accelerator = get_accelerator()
-print(f'{my_accelerator._name=}')
-print(f'{my_accelerator._communication_backend=}')
-print(f'{my_accelerator.HalfTensor().device=}')
-print(f'{my_accelerator.total_memory()=}')
+logger.info(f'{my_accelerator._name=}')
+logger.info(f'{my_accelerator._communication_backend=}')
+logger.info(f'{my_accelerator.HalfTensor().device=}')
+logger.info(f'{my_accelerator.total_memory()=}')
-----------[code] test_get.py -----------
---[output] python test_get.py---------
@@ -81,16 +191,16 @@ def set_accelerator(accel_obj):
-----------[code] test_set.py -----------
from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
+from deepspeed.utils import logger
cu_accel = CUDA_Accelerator()
-print(f'{id(cu_accel)=}')
+logger.info(f'{id(cu_accel)=}')
from deepspeed.accelerator import set_accelerator, get_accelerator
set_accelerator(cu_accel)
my_accelerator = get_accelerator()
-print(f'{id(my_accelerator)=}')
-print(f'{my_accelerator._name=}')
-print(f'{my_accelerator._communication_backend=}')
-print(f'{my_accelerator.HalfTensor().device=}')
-print(f'{my_accelerator.total_memory()=}')
+logger.info(f'{id(my_accelerator)=}')
+logger.info(f'{my_accelerator._name=}')
+logger.info(f'{my_accelerator._communication_backend=}')
+logger.info(f'{my_accelerator.HalfTensor().device=}')
+logger.info(f'{my_accelerator.total_memory()=}')
-----------[code] test_set.py -----------
@@ -102,4 +212,4 @@ def set_accelerator(accel_obj):
my_accelerator.HalfTensor().device=device(type='cuda', index=0)
my_accelerator.total_memory()=34089730048
---[output] python test_set.py---------
-'''
+"""
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000000..4c88b2dd091c
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,6 @@
+# DeepSpeed Benchmarks
+
+If you are looking for DeepSpeed benchmarks, please see the following resources:
+
+1. [Communication Benchmarking Suite](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/communication)
+2. [Inference Benchmarks](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/inference)
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
deleted file mode 100644
index fcb45ab2b685..000000000000
--- a/benchmarks/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
diff --git a/benchmarks/communication/README.md b/benchmarks/communication/README.md
deleted file mode 100644
index f760465b5c97..000000000000
--- a/benchmarks/communication/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Running Communication Benchmarks
-
-
-To run benchmarks, there are two options:
-
-1. Run a single communication operation:
-
-For example, run with a single large message size:
-
-deepspeed all_reduce.py
-
-
-Scan across message sizes:
-
-deepspeed all_reduce.py --scan
-
-
-2. Run all available communication benchmarks:
-
-
-deepspeed run_all.py
-
-
-Like the individual benchmarks, `run_all.py` supports scanning arguments for the max message size, bw-unit, etc. Simply pass the desired arguments to `run_all.py` and they'll be propagated to each comm op.
-
-
-usage: ds_bench [-h] [--local_rank LOCAL_RANK] [--trials TRIALS] [--warmups WARMUPS] [--maxsize MAXSIZE] [--async-op] [--bw-unit {Gbps,GBps}] [--backend {nccl}] [--dist {deepspeed,torch}] [--scan] [--raw] [--all-reduce] [--all-gather] [--all-to-all]
- [--pt2pt] [--broadcast] [--dtype DTYPE] [--mem-factor MEM_FACTOR] [--debug]
-
-optional arguments:
- -h, --help show this help message and exit
- --local_rank LOCAL_RANK
- --trials TRIALS Number of timed iterations
- --warmups WARMUPS Number of warmup (non-timed) iterations
- --maxsize MAXSIZE Max message size as a power of 2
- --async-op Enables non-blocking communication
- --bw-unit {Gbps,GBps}
- --backend {nccl} Communication library to use
- --dist {deepspeed,torch}
- Distributed DL framework to use
- --scan Enables scanning all message sizes
- --raw Print the message size and latency without units
- --all-reduce Run all_reduce
- --all-gather Run all_gather
- --all-to-all Run all_to_all
- --pt2pt Run pt2pt
- --broadcast Run broadcast
- --dtype DTYPE PyTorch tensor dtype
- --mem-factor MEM_FACTOR
- Proportion of max available GPU memory to use for single-size evals
- --debug Enables all_to_all debug prints
-
-
-Note that `ds_bench` is a pre-packaged wrapper around `run_all.py`. Users can pass the same arguments as well:
-
-
-/bin/ds_bench --scan --trials=10
-
-
-Finally, users can choose specific communication operations to run in `run_all.py` or `ds_bench` by passing them as arguments (all operations are run by default). For example:
-
-
-deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
-
-
-
-# Adding Communication Benchmarks
-
-To add new communication benchmarks, follow this general procedure:
-
-1. Copy a similar benchmark file (e.g. to add `reduce_scatter`, copy `all_reduce.py` as a template)
-2. Add a new bw formula in `utils.get_bw`, a new maximum tensor element formula in `utils.max_numel`, and a new arg in `utils.benchmark_parser`
-3. Replace comm op calls in new file with find-replace
-4. Find a good default `mem_factor` for use in `run__single()` function
-5. Add new comm op to `run_all.py`
diff --git a/benchmarks/communication/__init__.py b/benchmarks/communication/__init__.py
deleted file mode 100644
index fcb45ab2b685..000000000000
--- a/benchmarks/communication/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py
deleted file mode 100644
index dc97267b3840..000000000000
--- a/benchmarks/communication/all_gather.py
+++ /dev/null
@@ -1,159 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-# Run all_gather and print metrics
-def timed_all_gather(input, output, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- # use all_gather_base if available
- if args.dist == 'torch':
- if hasattr(torch.distributed, "_all_gather_base"):
- dist._all_gather_base(output, input, group=None, async_op=args.async_op)
- else:
- output_tensors = list(
- torch.chunk(output_tensor,
- cdb.get_world_size(group)))
- dist.all_gather(output_tensors, input_tensor, group=group, async_op=True)
- elif args.dist == 'deepspeed':
- dist.allgather_fn(output, input, group=None, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- # use all_gather_base if available
- if args.dist == 'torch':
- if hasattr(torch.distributed, "_all_gather_base"):
- dist._all_gather_base(output, input, group=None, async_op=args.async_op)
- else:
- output_tensors = list(
- torch.chunk(output_tensor,
- cdb.get_world_size(group)))
- dist.all_gather(output_tensors, input_tensor, group=group, async_op=True)
- elif args.dist == 'deepspeed':
- dist.allgather_fn(output, input, group=None, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('all_gather', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_all_gather(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'all_gather')
- global_rank = dist.get_rank()
- world_size = dist.get_world_size()
-
- if args.scan:
- # Create list of message sizes
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- # Delete original mat to avoid OOM
- del mat
- get_accelerator().empty_cache()
- output = torch.zeros(input.nelement() * world_size,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_all_gather(input, output, args)
- else:
- # all_gather_base saves memory
- if (args.dist == 'torch'
- and hasattr(torch.distributed,
- "_all_gather_base")) or (args.dist == 'deepspeed'
- and dist.has_allgather_base):
- mem_factor = args.mem_factor + 0.2
- else:
- mem_factor = args.mem_factor
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- sync_all()
- elements_per_gpu = max_numel(comm_op='all_gather',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=mem_factor,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- # multiply each GPU's tensor by the rank to ease debugging
- input = ((mat.mul_(float(global_rank))).view(-1))
- # Delete original mat to avoid OOM
- del mat
- get_accelerator().empty_cache()
- output = torch.zeros(
- elements_per_gpu * world_size,
- dtype=getattr(torch,
- args.dtype)).to(get_accelerator().device_name(local_rank))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
-
- sync_all()
- timed_all_gather(input, output, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_all_gather(local_rank=rank, args=args)
diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py
deleted file mode 100644
index edc1b99301c0..000000000000
--- a/benchmarks/communication/all_reduce.py
+++ /dev/null
@@ -1,113 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_all_reduce(input, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- dist.all_reduce(input, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- dist.all_reduce(input, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('all_reduce', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_all_reduce(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'all_reduce')
-
- world_size = dist.get_world_size()
- global_rank = dist.get_rank()
-
- if args.scan:
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_all_reduce(input, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- # Don't need output tensor, so we double mem_factor
- elements_per_gpu = max_numel(comm_op='all_reduce',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor * 2,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
- timed_all_reduce(input, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_all_reduce(local_rank=rank, args=args)
diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py
deleted file mode 100644
index bd35cf290e4c..000000000000
--- a/benchmarks/communication/all_to_all.py
+++ /dev/null
@@ -1,134 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_all_to_all(input, output, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- dist.all_to_all_single(output, input, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- dist.all_to_all_single(output, input, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('all_to_all', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_all_to_all(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- world_size = dist.get_world_size()
- global_rank = dist.get_rank()
- # Prepare benchmark header
- print_header(args, 'all_to_all')
-
- if args.scan:
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks"
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- output = (mat.clone().view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_all_to_all(input, output, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- elements_per_gpu = max_numel(comm_op='all_to_all',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks"
- input = ((mat.mul_(float(global_rank))).view(-1))
- # Delete original mat to avoid OOM
- del mat
- get_accelerator().empty_cache()
- output = torch.zeros(
- elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(get_accelerator().device_name(local_rank))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
-
- if args.debug:
- for i in range(world_size):
- if i == global_rank:
- print(f"Before AllToAll Input List at rank {global_rank}: {input}")
- dist.barrier()
-
- timed_all_to_all(input, output, args)
-
- if args.debug:
- for i in range(world_size):
- if i == global_rank:
- print(f"AllToAll Results at rank {global_rank}: {output}")
- dist.barrier()
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_all_to_all(local_rank=rank, args=args)
diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py
deleted file mode 100644
index 633e46638fac..000000000000
--- a/benchmarks/communication/broadcast.py
+++ /dev/null
@@ -1,114 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import torch
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_broadcast(input, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- dist.broadcast(input, 0, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- dist.broadcast(input, 0, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('broadcast', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_broadcast(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'broadcast')
-
- world_size = dist.get_world_size()
- global_rank = dist.get_rank()
-
- if args.scan:
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_broadcast(input, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- # Don't need output tensor, so we double mem_factor
- elements_per_gpu = max_numel(comm_op='broadcast',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor * 2,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
- timed_broadcast(input, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_broadcast(local_rank=rank, args=args)
diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py
deleted file mode 100644
index 935927acd174..000000000000
--- a/benchmarks/communication/constants.py
+++ /dev/null
@@ -1,10 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-from deepspeed.accelerator import get_accelerator
-
-DEFAULT_WARMUPS = 5
-DEFAULT_TRIALS = 50
-DEFAULT_TYPE = 'float'
-DEFAULT_BACKEND = get_accelerator().communication_backend_name()
-DEFAULT_UNIT = 'Gbps'
-DEFAULT_DIST = 'deepspeed'
-DEFAULT_MAXSIZE = 24
diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py
deleted file mode 100644
index 1c890fc42e93..000000000000
--- a/benchmarks/communication/pt2pt.py
+++ /dev/null
@@ -1,132 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_pt2pt(input, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- if dist.get_rank() == 0:
- if args.async_op:
- dist.isend(input, 1)
- else:
- dist.send(input, 1)
- if dist.get_rank() == 1:
- if args.async_op:
- dist.irecv(input, src=0)
- else:
- dist.recv(input, src=0)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- if dist.get_rank() == 0:
- if args.async_op:
- dist.isend(input, 1)
- else:
- dist.send(input, 1)
- if dist.get_rank() == 1:
- if args.async_op:
- dist.irecv(input, src=0)
- else:
- dist.recv(input, src=0)
-
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('pt2pt', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_pt2pt(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'pt2pt')
- global_rank = dist.get_rank()
- world_size = dist.get_world_size()
-
- if args.scan:
- # Create list of message sizes
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_pt2pt(input, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- # Don't need output tensor, so double mem_factor
- elements_per_gpu = max_numel(comm_op='pt2pt',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor * 2,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
- timed_pt2pt(input, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_pt2pt(local_rank=rank, args=args)
diff --git a/benchmarks/communication/run_all.py b/benchmarks/communication/run_all.py
deleted file mode 100644
index 7ec562cc9ae0..000000000000
--- a/benchmarks/communication/run_all.py
+++ /dev/null
@@ -1,49 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.all_reduce import run_all_reduce
-from benchmarks.communication.all_gather import run_all_gather
-from benchmarks.communication.all_to_all import run_all_to_all
-from benchmarks.communication.pt2pt import run_pt2pt
-from benchmarks.communication.broadcast import run_broadcast
-from benchmarks.communication.constants import *
-
-
-# For importing
-def main(args, rank):
-
- init_processes(local_rank=rank, args=args)
-
- ops_to_run = []
- if args.all_reduce:
- ops_to_run.append('all_reduce')
- if args.all_gather:
- ops_to_run.append('all_gather')
- if args.broadcast:
- ops_to_run.append('broadcast')
- if args.pt2pt:
- ops_to_run.append('pt2pt')
- if args.all_to_all:
- ops_to_run.append('all_to_all')
-
- if len(ops_to_run) == 0:
- ops_to_run = ['all_reduce', 'all_gather', 'all_to_all', 'broadcast', 'pt2pt']
-
- for comm_op in ops_to_run:
- if comm_op == 'all_reduce':
- run_all_reduce(local_rank=rank, args=args)
- if comm_op == 'all_gather':
- run_all_gather(local_rank=rank, args=args)
- if comm_op == 'all_to_all':
- run_all_to_all(local_rank=rank, args=args)
- if comm_op == 'pt2pt':
- run_pt2pt(local_rank=rank, args=args)
- if comm_op == 'broadcast':
- run_broadcast(local_rank=rank, args=args)
-
-
-# For directly calling benchmark
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- main(args, rank)
diff --git a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py
deleted file mode 100644
index b913dda14fe5..000000000000
--- a/benchmarks/communication/utils.py
+++ /dev/null
@@ -1,220 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import torch
-import os
-import math
-import argparse
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-global dist
-
-
-def init_torch_distributed(backend):
- global dist
- import torch.distributed as dist
- torch.distributed.init_process_group(backend)
- local_rank = int(os.environ['LOCAL_RANK'])
- get_accelerator().set_device(local_rank)
-
-
-def init_deepspeed_comm(backend):
- global dist
- import deepspeed
- import deepspeed.comm as dist
- deepspeed.init_distributed(dist_backend=backend)
- local_rank = int(os.environ['LOCAL_RANK'])
- get_accelerator().set_device(local_rank)
-
-
-def init_processes(local_rank, args):
- if args.dist == 'deepspeed':
- init_deepspeed_comm(args.backend)
- elif args.dist == 'torch':
- init_torch_distributed(args.backend)
- else:
- print_rank_0(f"distributed framework {args.dist} not supported")
- exit(0)
-
-
-def print_rank_0(message):
- if dist.get_rank() == 0:
- print(message)
-
-
-def print_header(args, comm_op):
- if comm_op == 'pt2pt':
- world_size = 2
- else:
- world_size = dist.get_world_size()
- tput = f'Throughput ({args.bw_unit})'
- busbw = f'BusBW ({args.bw_unit})'
- header = f"\n---- Performance of {comm_op} on {world_size} devices ---------------------------------------------------------\n"
- duration_str = 'Duration'
- if args.raw:
- duration_str += ' (us)'
- header += f"{'Size (Bytes)':20s} {'Description':25s} {duration_str:20s} {tput:20s} {busbw:20s}\n"
- header += "----------------------------------------------------------------------------------------------------"
- print_rank_0(header)
-
-
-def get_bw(comm_op, size, duration, args):
- n = dist.get_world_size()
- tput = 0
- busbw = 0
- if comm_op == "all_to_all":
- tput = (size / duration)
- busbw = (size / duration) * ((n - 1) / n)
- elif comm_op == "all_gather":
- size *= n
- tput = (size / duration)
- busbw = (size / duration) * ((n - 1) / n)
- elif comm_op == "all_reduce":
- tput = (size * 2 / duration)
- busbw = (size / duration) * (2 * (n - 1) / n)
- elif comm_op == "pt2pt" or comm_op == "broadcast":
- tput = (size / duration)
- busbw = tput
- else:
- print_rank_0("wrong comm_op specified")
- exit(0)
-
- if args.bw_unit == 'Gbps':
- tput *= 8
- busbw *= 8
-
- return tput, busbw
-
-
-def get_metric_strings(args, tput, busbw, duration):
- duration_ms = duration * 1e3
- duration_us = duration * 1e6
- tput = f'{tput / 1e9:.3f}'
- busbw = f'{busbw /1e9:.3f}'
-
- if duration_us < 1e3 or args.raw:
- duration = f'{duration_us:.3f}'
- if not args.raw:
- duration += ' us'
- else:
- duration = f'{duration_ms:.3f} ms'
- return tput, busbw, duration
-
-
-def sync_all():
- get_accelerator().synchronize()
- dist.barrier()
-
-
-def max_numel(comm_op, dtype, mem_factor, local_rank, args):
- dtype_size = _element_size(dtype)
- max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor
- if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast':
- elements_per_gpu = int(max_memory_per_gpu // dtype_size)
- elif comm_op == 'all_gather':
- # all_gather performance is lower for non-powers of two, and the output buffer size scales with world size
- # Therefore, divide by world size and round down to nearest power of 2
- elements_per_gpu = int(max_memory_per_gpu // dtype_size // dist.get_world_size())
- elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
- elif comm_op == 'all_to_all':
- # Number of elements must be divisible by world_size
- # all_to_all performance is lower for non-powers of two. Round down like all_gather.
- elements_per_gpu = int(max_memory_per_gpu // dtype_size)
- elements_per_gpu = int(dist.get_world_size() *
- round(elements_per_gpu / dist.get_world_size()))
- elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
- else:
- print(f"This communication operation: {comm_op} is not supported yet")
- exit(0)
- return elements_per_gpu
-
-
-# Helper function to pretty-print message sizes
-def convert_size(size_bytes):
- if size_bytes == 0:
- return "0B"
- size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
- i = int(math.floor(math.log(size_bytes, 1024)))
- p = math.pow(1024, i)
- s = round(size_bytes / p, 2)
- return "%s %s" % (s, size_name[i])
-
-
-# Copied from torch. Need to add the func here for old torch compatibility.
-def _element_size(dtype):
- """
- Returns the element size for a dtype, in bytes
- """
- if not isinstance(dtype, torch.dtype):
- raise RuntimeError(f'expected torch.dtype, but got {type(dtype)}')
-
- if dtype.is_complex:
- return torch.finfo(dtype).bits >> 2
- elif dtype.is_floating_point:
- return torch.finfo(dtype).bits >> 3
- elif dtype == torch.bool:
- # NOTE: torch.bool is not supported in torch.iinfo()
- return 1
- else:
- return torch.iinfo(dtype).bits >> 3
-
-
-def benchmark_parser():
- parser = argparse.ArgumentParser()
- parser.add_argument("--local_rank", type=int)
- parser.add_argument("--trials",
- type=int,
- default=DEFAULT_TRIALS,
- help='Number of timed iterations')
- parser.add_argument("--warmups",
- type=int,
- default=DEFAULT_WARMUPS,
- help='Number of warmup (non-timed) iterations')
- parser.add_argument("--maxsize",
- type=int,
- default=24,
- help='Max message size as a power of 2')
- parser.add_argument("--async-op",
- action="store_true",
- help='Enables non-blocking communication')
- parser.add_argument("--bw-unit",
- type=str,
- default=DEFAULT_UNIT,
- choices=['Gbps',
- 'GBps'])
- parser.add_argument("--backend",
- type=str,
- default=DEFAULT_BACKEND,
- choices=['nccl',
- 'ccl'],
- help='Communication library to use')
- parser.add_argument("--dist",
- type=str,
- default=DEFAULT_DIST,
- choices=['deepspeed',
- 'torch'],
- help='Distributed DL framework to use')
- parser.add_argument("--scan",
- action="store_true",
- help='Enables scanning all message sizes')
- parser.add_argument("--raw",
- action="store_true",
- help='Print the message size and latency without units')
- parser.add_argument("--all-reduce", action="store_true", help='Run all_reduce')
- parser.add_argument("--all-gather", action="store_true", help='Run all_gather')
- parser.add_argument("--all-to-all", action="store_true", help='Run all_to_all')
- parser.add_argument("--pt2pt", action="store_true", help='Run pt2pt')
- parser.add_argument("--broadcast", action="store_true", help='Run broadcast')
- parser.add_argument("--dtype",
- type=str,
- default=DEFAULT_TYPE,
- help='PyTorch tensor dtype')
- parser.add_argument(
- "--mem-factor",
- type=float,
- default=.4,
- help='Proportion of max available GPU memory to use for single-size evals')
- parser.add_argument("--debug",
- action="store_true",
- help='Enables all_to_all debug prints')
- return parser
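
For reference, the get_bw() helper above (these benchmarks now live in DeepSpeedExamples, per the new benchmarks/README.md) applies the standard bus-bandwidth corrections. A worked example for all_reduce with illustrative numbers:

-----------[code] busbw_example.py -----------
size = 2**30       # 1 GiB message
duration = 0.050   # 50 ms measured average per op
n = 8              # world size

tput = size * 2 / duration                     # algorithm bandwidth, bytes/s
busbw = (size / duration) * (2 * (n - 1) / n)  # ring all_reduce bus bandwidth
print(f"tput  = {tput * 8 / 1e9:.1f} Gbps")    # 343.6 Gbps
print(f"busbw = {busbw * 8 / 1e9:.1f} Gbps")   # 300.6 Gbps
-----------[code] busbw_example.py -----------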
diff --git a/benchmarks/inference/bert-bench.py b/benchmarks/inference/bert-bench.py
deleted file mode 100644
index 9d586d033cd7..000000000000
--- a/benchmarks/inference/bert-bench.py
+++ /dev/null
@@ -1,92 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import torch
-import time
-import deepspeed
-import argparse
-from transformers import pipeline
-from deepspeed.accelerator import get_accelerator
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--model", "-m", type=str, help="hf model name")
-parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
-parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
-parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
-parser.add_argument("--local_rank", type=int, default=0, help="local rank")
-parser.add_argument("--trials", type=int, default=30, help="number of trials")
-parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
-parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
-args = parser.parse_args()
-
-
-def print_latency(latency_set, title, warmup=3):
- # trim warmup queries
- latency_set = latency_set[warmup:]
- count = len(latency_set)
- if count > 0:
- latency_set.sort()
- n50 = (count - 1) * 0.5 + 1
- n90 = (count - 1) * 0.9 + 1
- n95 = (count - 1) * 0.95 + 1
- n99 = (count - 1) * 0.99 + 1
- n999 = (count - 1) * 0.999 + 1
-
- avg = sum(latency_set) / count
- p50 = latency_set[int(n50) - 1]
- p90 = latency_set[int(n90) - 1]
- p95 = latency_set[int(n95) - 1]
- p99 = latency_set[int(n99) - 1]
- p999 = latency_set[int(n999) - 1]
-
- print(f"====== latency stats {title} ======")
- print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
- print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
- print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
- print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
- print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
- print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))
-
-
-deepspeed.init_distributed()
-
-print(args.model, args.max_tokens, args.dtype)
-
-if args.dtype.lower() == "fp16":
- dtype = torch.float16
-else:
- dtype = torch.float32
-
-pipe = pipeline("fill-mask", model=args.model, framework="pt", device=args.local_rank)
-
-if dtype == torch.half:
- pipe.model.half()
-
-mask = pipe.tokenizer.mask_token
-
-br = pipe(f"Hello I'm a {mask} model")
-if args.deepspeed:
- pipe.model = deepspeed.init_inference(pipe.model,
- dtype=dtype,
- mp_size=1,
- replace_with_kernel_inject=args.kernel_inject,
- enable_cuda_graph=args.graphs)
- pipe.model.profile_model_time()
-
-responses = []
-times = []
-mtimes = []
-for i in range(args.trials):
- get_accelerator().synchronize()
- start = time.time()
- r = pipe(f"Hello I'm a {mask} model")
- get_accelerator().synchronize()
- end = time.time()
- responses.append(r)
- times.append((end - start))
- mtimes += pipe.model.model_times()
- #print(f"{pipe.model.model_times()=}")
-
-print_latency(times, "e2e latency")
-print_latency(mtimes, "model latency")
-
-print(responses[0:3])
diff --git a/benchmarks/inference/collect_results.py b/benchmarks/inference/collect_results.py
deleted file mode 100644
index 0e51033114db..000000000000
--- a/benchmarks/inference/collect_results.py
+++ /dev/null
@@ -1,147 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import os
-import re
-import argparse
-import pandas as pd
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "--results-dir",
- "-r",
- type=str,
- default="./results",
- help="directory containing sweep results",
-)
-parser.add_argument("--version",
- "-v",
- type=int,
- default=0,
- help="version to be collected")
-parser.add_argument("--gen-text-n",
- "-n",
- type=int,
- default=1,
- help="expected number of generated text")
-parser.add_argument("--output",
- "-o",
- type=str,
- default="./results.csv",
- help="output file")
-args = parser.parse_args()
-
-
-def get_branch(file_path):
- match = re.match(r".*\/(.*)\.log", file_path)
- if match is None:
- return False
- else:
- return match.groups()[0]
-
-
-def get_benchmark_params(root_dir, file_path):
- match = re.match(
- rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/",
- file_path,
- )
- if match is None:
- return False
- else:
- model, dtype, graphs, kernel, gpus, version = match.groups()
- bool_dict = {"true": True, "false": False}
- return {
- "model": model,
- "dtype": dtype,
- "graphs": bool_dict[graphs.lower()],
- "kernel": bool_dict[kernel.lower()],
- "gpus": int(gpus),
- "version": int(version),
- }
-
-
-def get_perf_data(file_content):
- matches = re.findall(r"\s+(.+?)\sLatency:\s+(\d+\.\d+)\sms", file_content)
- if matches is []:
- return False
- else:
- return {f"latency-{key}": float(val) for key, val in matches}
-
-
-def get_generated_text(file_content, gen_text_n):
- file_content = file_content.replace("\n", " ")
- file_content = file_content.replace("\t", " ")
- matches = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", file_content)
- if len(matches) != gen_text_n:
- return False
- else:
- return {f"generated-text-{key}": val for key, val in matches}
-
-
-def get_error(file_content):
- matches = re.findall(r"Error:\s+(.+?)\n", file_content)
- if matches is []:
- return False
- else:
- return {f"error": val for val in matches}
-
-
-if __name__ == "__main__":
- # List to collect data from all benchmarks
- benchmarks_data = []
-
- # Walk through directory of results from sweep.sh
- for root, dirs, files in os.walk(args.results_dir):
- # Because of how some models are named, the dir structure for results can vary, e.g.:
- # "EleutherAI/gpt-neo_*/baseline.log" versus "gpt2_*/baseline.log"
- if dirs:
- continue
-
- # Get data from baseline and each tested branch
- for name in files:
- file_path = os.path.join(root, name)
-
- branch = get_branch(file_path)
- if not branch:
- print(f"WARNING: Could not detect branch for file {file_path}, skipping")
- continue
-
- params = get_benchmark_params(args.results_dir, file_path)
- if not params:
- print(
- f"WARNING: Could not detect benchmark settings for file {file_path}, skipping"
- )
- continue
-
- # Verify that the version matches that which we want to collect
- if params["version"] != args.version:
- continue
-
- with open(file_path, "r") as f:
- file_content = f.read()
-
- perf_data = get_perf_data(file_content)
- if not perf_data:
- print(
- f"WARNING: Could not detect benchmark performance data for file {file_path}"
- )
-
- generated_text = get_generated_text(file_content, args.gen_text_n)
- if not generated_text:
- print(f"WARNING: Could not detect generated text for file {file_path}")
-
- error = get_error(file_content)
- if error:
- print(f"Error found in {file_path}, collecting error info...")
- benchmarks_data.append({"branch": branch, **params, **error})
- continue
-
- benchmarks_data.append({
- "branch": branch,
- **params,
- **perf_data,
- **generated_text
- })
-
- # Convert to a DataFrame and save
- benchmarks_df = pd.DataFrame(benchmarks_data)
- benchmarks_df.to_csv(args.output)
diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py
deleted file mode 100644
index 29578b30cf1f..000000000000
--- a/benchmarks/inference/gpt-bench.py
+++ /dev/null
@@ -1,124 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import os
-import torch
-import time
-import deepspeed
-import argparse
-from transformers import pipeline
-from deepspeed.accelerator import get_accelerator
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--model", "-m", type=str, help="hf model name")
-parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
-parser.add_argument("--dtype",
- type=str,
- default="fp16",
- choices=["fp16",
- "fp32",
- "int8"],
- help="int8, fp16, or fp32")
-parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
-parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
-parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
-parser.add_argument("--local_rank",
- type=int,
- default=int(os.getenv("LOCAL_RANK",
- "0")),
- help="local rank")
-parser.add_argument("--world_size",
- type=int,
- default=int(os.getenv("WORLD_SIZE",
- "1")),
- help="world size")
-parser.add_argument("--trials", type=int, default=30, help="number of trials")
-args = parser.parse_args()
-
-
-def print_latency(latency_set, title, warmup=3):
- # trim warmup queries
- latency_set = list(latency_set)
- latency_set = latency_set[warmup:]
- count = len(latency_set)
- if count > 0:
- latency_set.sort()
- n50 = (count - 1) * 0.5 + 1
- n90 = (count - 1) * 0.9 + 1
- n95 = (count - 1) * 0.95 + 1
- n99 = (count - 1) * 0.99 + 1
- n999 = (count - 1) * 0.999 + 1
-
- avg = sum(latency_set) / count
- p50 = latency_set[int(n50) - 1]
- p90 = latency_set[int(n90) - 1]
- p95 = latency_set[int(n95) - 1]
- p99 = latency_set[int(n99) - 1]
- p999 = latency_set[int(n999) - 1]
-
- print(f"====== latency stats {title} ======")
- print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
- print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
- print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
- print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
- print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
- print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))
-
-
-deepspeed.init_distributed()
-
-if args.local_rank == 0:
- print("BENCHMARK SETTINGS:")
- print(f"\tMODEL: {args.model}")
- print(f"\tMAX_TOKENS: {args.max_tokens}")
- print(f"\tDTYPE: {args.dtype}")
- print(f"\tCUDA_GRAPHS: {args.graphs}")
- print(f"\tKERNEL_INJECT: {args.kernel_inject}")
-
-if args.dtype == "int8":
- dtype = torch.int8
-elif args.dtype == "fp16":
- dtype = torch.float16
-else:
- dtype = torch.float32
-
-pipe = pipeline("text-generation",
- model=args.model,
- framework="pt",
- device=args.local_rank)
-
-if dtype == torch.float16:
- pipe.model.half()
-
-if args.deepspeed:
- pipe.model = deepspeed.init_inference(
- pipe.model,
- dtype=dtype,
- mp_size=args.world_size,
- replace_with_kernel_inject=args.kernel_inject,
- enable_cuda_graph=args.graphs,
- )
- pipe.model.profile_model_time()
-
-responses = []
-times = []
-mtimes = []
-for i in range(args.trials):
- get_accelerator().synchronize()
- start = time.time()
- r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
- get_accelerator().synchronize()
- end = time.time()
- responses.append(r)
- times.append(end - start) # / (args.max_tokens - 3))
- mtimes.append(sum(pipe.model.model_times()))
-
-if args.local_rank == 0:
- print_latency(times, "(e2e) latency")
- print_latency(mtimes, "(model-only) latency")
- print_latency(map(lambda t: t / (args.max_tokens - 3),
- times),
- "(e2e) per token latency")
- print(f"RESPONSE 0:")
- print("-" * 30)
- print(responses[0][0]["generated_text"])
- print("-" * 30)
diff --git a/benchmarks/inference/requirements.txt b/benchmarks/inference/requirements.txt
deleted file mode 100644
index 00899dd5f485..000000000000
--- a/benchmarks/inference/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-transformers>=4.21.3
diff --git a/benchmarks/inference/run_model.sh b/benchmarks/inference/run_model.sh
deleted file mode 100644
index 8e5fe3ac0133..000000000000
--- a/benchmarks/inference/run_model.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-set -x
-
-model=$1
-branch1=$2
-branch2=$3
-dtype=$4
-graphs=$5
-kernel=$6
-gpus=$7
-
-version=0
-log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version}
-mkdir -p ${log_path}
-
-params="--dtype $dtype "
-if [[ "$graphs" == "true" ]]; then
- params+="--graphs "
-fi
-if [[ "$kernel" == "true" ]]; then
- params+="--kernel "
-fi
-
-echo "baseline $log_path"
-deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log
-
-cd ../../
-git checkout ${branch1}
-cd -
-echo "ds ${branch1} $log_path"
-deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch1}.log
-
-cd ../../
-git checkout ${branch2}
-cd -
-echo "ds ${branch2} $log_path"
-deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params&> ${log_path}/ds-${branch2}.log
diff --git a/benchmarks/inference/sweep.sh b/benchmarks/inference/sweep.sh
deleted file mode 100644
index aabcb0bfdbd8..000000000000
--- a/benchmarks/inference/sweep.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-set -x
-
-export TRANSFORMERS_CACHE=/tmp/hf-cache
-
-branch1=$1
-branch2=$2
-
-gptneo_models="EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M"
-gpt2_models="gpt2 gpt2-large gpt2-xl"
-gptj_models="EleutherAI/gpt-j-6B"
-opt_models="facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b"
-bloom_models="bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1"
-
-for gpus in `echo "1 2 4 8"`; do
- for dtype in `echo "fp16 fp32"`; do
- for graphs in `echo "true false"`; do
- for kernel in `echo "true false"`; do
- params="$dtype $graphs $kernel $gpus"
- for m in `echo "$gptneo_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$gpt2_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$gptj_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$opt_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$bloom_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
- done
- done
- done
-done
diff --git a/bin/ds_elastic b/bin/ds_elastic
index c9987d4565da..1c78aea88894 100755
--- a/bin/ds_elastic
+++ b/bin/ds_elastic
@@ -9,11 +9,7 @@ from deepspeed.elasticity import compute_elastic_config
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json")
- parser.add_argument('-w',
- '--world-size',
- type=int,
- default=0,
- help="Intended/current world size")
+ parser.add_argument('-w', '--world-size', type=int, default=0, help="Intended/current world size")
args = parser.parse_args()
ds_config = json.load(open(args.config, 'r'))
@@ -26,7 +22,9 @@ if __name__ == '__main__':
print(json.dumps(elastic_config, indent=4, sort_keys=True))
if args.world_size > 0:
- final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version, world_size=args.world_size)
+ final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config,
+ target_deepspeed_version=ds_version,
+ world_size=args.world_size)
print('------------------------------------------')
print(f"Calculated results for world size {args.world_size}:")
print('------------------------------------------')
diff --git a/blogs/README.md b/blogs/README.md
new file mode 100644
index 000000000000..c5c72fd4efb0
--- /dev/null
+++ b/blogs/README.md
@@ -0,0 +1 @@
+All DeepSpeed blogs are linked here:
diff --git a/blogs/assets/images/Throughputs-OPT.png b/blogs/assets/images/Throughputs-OPT.png
new file mode 100755
index 000000000000..9b968319f759
Binary files /dev/null and b/blogs/assets/images/Throughputs-OPT.png differ
diff --git a/blogs/assets/images/ds-chat-overview.png b/blogs/assets/images/ds-chat-overview.png
new file mode 100755
index 000000000000..479ca4bb2ad7
Binary files /dev/null and b/blogs/assets/images/ds-chat-overview.png differ
diff --git a/blogs/assets/images/figure3.png b/blogs/assets/images/figure3.png
new file mode 100755
index 000000000000..44b8e67bd6b9
Binary files /dev/null and b/blogs/assets/images/figure3.png differ
diff --git a/blogs/assets/images/figure4.png b/blogs/assets/images/figure4.png
new file mode 100755
index 000000000000..dca56637049c
Binary files /dev/null and b/blogs/assets/images/figure4.png differ
diff --git a/blogs/assets/images/figure5.png b/blogs/assets/images/figure5.png
new file mode 100755
index 000000000000..6282c0d19ed1
Binary files /dev/null and b/blogs/assets/images/figure5.png differ
diff --git a/blogs/assets/images/figure6.png b/blogs/assets/images/figure6.png
new file mode 100755
index 000000000000..8e60773b5709
Binary files /dev/null and b/blogs/assets/images/figure6.png differ
diff --git a/blogs/assets/images/figure7.png b/blogs/assets/images/figure7.png
new file mode 100755
index 000000000000..9b400b11efb0
Binary files /dev/null and b/blogs/assets/images/figure7.png differ
diff --git a/blogs/assets/images/hero-figure-black-ja.png b/blogs/assets/images/hero-figure-black-ja.png
new file mode 100644
index 000000000000..5c6cf05d3e7a
Binary files /dev/null and b/blogs/assets/images/hero-figure-black-ja.png differ
diff --git a/blogs/assets/images/hero-figure-black.png b/blogs/assets/images/hero-figure-black.png
new file mode 100755
index 000000000000..6a05f4b27bb9
Binary files /dev/null and b/blogs/assets/images/hero-figure-black.png differ
diff --git a/blogs/assets/images/hybrid-engine.png b/blogs/assets/images/hybrid-engine.png
new file mode 100755
index 000000000000..5e501108a5b7
Binary files /dev/null and b/blogs/assets/images/hybrid-engine.png differ
diff --git a/blogs/assets/images/mascot.png b/blogs/assets/images/mascot.png
new file mode 100755
index 000000000000..e9f7a354fc85
Binary files /dev/null and b/blogs/assets/images/mascot.png differ
diff --git a/blogs/assets/images/triton-bert-base-latency.png b/blogs/assets/images/triton-bert-base-latency.png
new file mode 100644
index 000000000000..4f733f4d1afe
Binary files /dev/null and b/blogs/assets/images/triton-bert-base-latency.png differ
diff --git a/blogs/assets/images/triton-bert-large-latency.png b/blogs/assets/images/triton-bert-large-latency.png
new file mode 100644
index 000000000000..d82dc0ccac51
Binary files /dev/null and b/blogs/assets/images/triton-bert-large-latency.png differ
diff --git a/blogs/comm-opt/README.md b/blogs/comm-opt/README.md
new file mode 100644
index 000000000000..4767c4342816
--- /dev/null
+++ b/blogs/comm-opt/README.md
@@ -0,0 +1,82 @@
+
+
+# Communication Optimizations for Large-Scale Training
+
+
+
+
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Gradient AllReduce Optimization for ZeRO stages 1 and 2](#ar-opt)
+3. [Optimizing Parameter All-Gather for ZeRO2 Training](#ag-opt)
+4. [Optimizing AlltoAll for Sequence-Parallel Training](#sp-opt)
+
+
+## 1. Introduction
+Training LLMs on large datasets can be extremely costly in terms of both hardware resources and time. An important step in minimizing such costs is to carefully combine an appropriate number of resources with a scalable library that guarantees training completion within a time limit. In this post, we discuss a key aspect of DeepSpeed's scalability features: communication optimization. Communication collectives (e.g., all-reduce, all-gather, etc.) are critical pieces of many popular DeepSpeed technologies (e.g., ZeRO, MoE, AutoTP, etc.), and in the following sections we discuss our new optimizations for some of these collectives. These optimizations are available in DeepSpeed versions >= 0.x.x.
+
+## 2. Gradient AllReduce Optimization for ZeRO stages 1 and 2
+
+Before diving into this optimization, let's take a step back and show some of the case studies that demonstrate the need.
+
+The AllReduce operation is an important part of the training process. In ZeRO, we handle it in buckets, which can be configured for good communication throughput. As the number of GPUs increases, however, we encounter smaller-partition AllReduces, where the current bucketing scheme cannot amortize the communication overhead. This mostly becomes an issue when training smaller-scale models (like Llama-7B) with a large number of GPUs.
+
+For instance, when training a dense 7B architecture with ZeRO stages 1 or 2, the AllReduce time increases by 1 and 2 seconds when scaling from 256 to 512 and 1024 A100 GPUs, respectively. This issue mostly arises from the fact that gradient averaging happens with smaller partitions (#parameters / #GPUs) per GPU rank. It gets more serious when training MoE architectures (3 - 12 seconds), for which the experts' parameters can sit farther apart due to the current parallelism layout of data and expert parallelism.
+
+In this section, we introduce two main optimization techniques for alleviating these communication bottlenecks.
+
+First, multi-rank bucketing for the same process group: for this optimization, we simply pack all the data that needs to be reduced from different ranks into one big flattened tensor and call a single AllReduce instead of per-rank reduce operations. After the reduction, we scatter the right portion of the data back to the corresponding ranks.
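+
+As an illustration of the multi-rank bucketing idea, here is a minimal sketch (not DeepSpeed's actual implementation; `fused_grad_reduce` and the equal-partition assumption are ours): the whole gradient bucket is reduced with one collective, and each rank then keeps only the slice it owns.
+
+```python
+import torch
+import torch.distributed as dist
+
+def fused_grad_reduce(flat_grads: torch.Tensor, group=None) -> torch.Tensor:
+    """One AllReduce over the whole flattened gradient bucket; each rank
+    then keeps only the partition it owns, instead of issuing one small
+    Reduce per rank. Assumes numel is divisible by the world size."""
+    world = dist.get_world_size(group)
+    rank = dist.get_rank(group)
+    dist.all_reduce(flat_grads, op=dist.ReduceOp.SUM, group=group)
+    flat_grads.div_(world)  # gradient averaging
+    part = flat_grads.numel() // world
+    return flat_grads.narrow(0, rank * part, part)
+```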
+
+Second, a new layout for expert-data parallelism: the default parallelism layout for MoE architectures (as shown in Fig 1) places the experts first on E parallel GPUs and then replicates them D times (data-parallel). With this layout, we encounter a slower AllReduce, as data-parallel ranks are placed farther apart, especially when the communication crosses nodes. We call this layout E + D.
+
+
+
+
+ *Fig 1: Different MoE parallel layouts. Left) E + D, which places the GPUs in the EP dimension first before adding DP; right) D + E, which replicates each expert DP times before constructing EP. We get a faster AllReduce with the second layout at the cost of a slower AlltoAll. This potentially results in faster e2e training time, as the communication volume of AllReduce (total parameter size) is normally much larger than that of AlltoAll (MLP activation memory).*
+
+By changing this layout from E + D to D + E (shown in Fig 1), where we first replicate each expert D times and then lay the replicas out across the expert-parallel dimension, we can reduce the AllReduce time substantially. On an A100-DGX cluster, where each node has 8 GPUs, we see about an 8x reduction in cross-node InfiniBand communication volume for the parameter-update process, which is now handled faster over the intra-node NVLinks. Note that this optimization increases the cost of the AlltoAll in the MoE part of the model; however, we have seen that the performance benefit on AllReduce outweighs this cost.
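+
+To make the two layouts concrete, the following illustrative helper (not DeepSpeed code; `moe_rank_groups` is a name we made up) enumerates the expert-parallel (EP) and data-parallel (DP) rank groups implied by each ordering:
+
+```python
+def moe_rank_groups(world_size: int, ep_size: int, layout: str):
+    """List EP and DP rank groups for the E + D and D + E layouts of Fig 1."""
+    dp_size = world_size // ep_size
+    if layout == "E+D":
+        # EP groups are contiguous; an expert's DP replicas are strided by E
+        ep = [list(range(d * ep_size, (d + 1) * ep_size)) for d in range(dp_size)]
+        dp = [list(range(e, world_size, ep_size)) for e in range(ep_size)]
+    else:  # "D+E"
+        # DP groups are contiguous (fast intra-node AllReduce); EP is strided
+        dp = [list(range(e * dp_size, (e + 1) * dp_size)) for e in range(ep_size)]
+        ep = [list(range(d, world_size, dp_size)) for d in range(dp_size)]
+    return ep, dp
+
+# 16 GPUs (2 nodes x 8), ep_size=8: under E+D the DP group of expert 0 is
+# [0, 8] (cross-node); under D+E it is [0, 1] (intra-node NVLink).
+```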
+
+Table 1 summarizes the savings observed when training a 7B dense and an MoE architecture with the optimized AllReduce scheme. After applying the multi-rank bucketing technique, we reduce the AllReduce time by 4x for the dense architecture and 5x - 8x for the MoE one. In addition, we obtain an extra 3x saving using the new D + E layout for the MoE architecture. Thus, the performance gain is larger for MoE architectures on large GPU counts. For instance, when training a 7B-base MoE architecture, we reduce the iteration time from 13 sec to 9.5 sec on 512 GPUs (37%) and from 16.1 sec to 5.1 sec on a 1k-GPU setup (3.2x).
+
+
+| | GPUs | AllReduce time (s) | Iteration time (s) |
+|----------|:------:|:------:|:------:|
+| baseline (dense) | 1024 | 1.2 | 5.4 |
+| optimized (dense) | 1024 | 0.36 | 4.5 |
+| baseline (MoE) | 1024 | 11.5 | 16.1 |
+| optimized (MoE) | 1024 | 0.45 | 5.1 |
+
+Table 1. AllReduce saving observed for both dense and MoE architectures.
+
+
+
+## 3. Optimizing Parameter All-Gather for ZeRO2 Training
+
+As with AllReduce, all-gather takes longer when there are more partitions. Since the parameters are stored in a flattened buffer for ZeRO stage 2, we can use a single call to all-gather the parameters into this tensor.
+
+When all-gathering the updated parameters at ZeRO stage 2, the bucketing scheme uses several narrow operations and creates a list of tensors with the bucket size from each partition. This scheme was needed to align with the `all_gather` operation from PyTorch.
+However, by adding support for `all_gather_into_tensor`, an operation available in newer versions of PyTorch, we can use a single kernel call to do the full-parameter all-gather. With this optimization, we see about a 2x reduction in the step time for large-scale training.
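+
+A minimal sketch of the two code paths (illustrative, not DeepSpeed's source; `flat_params` and the equal-partition layout are assumptions):
+
+```python
+import torch
+import torch.distributed as dist
+
+def allgather_updated_params(flat_params: torch.Tensor, group=None):
+    """Each rank owns one contiguous, equally-sized partition of the
+    flattened parameter buffer and broadcasts it to everyone."""
+    world = dist.get_world_size(group)
+    part = flat_params.numel() // world
+    my_part = flat_params.narrow(0, dist.get_rank(group) * part, part).clone()
+    if hasattr(dist, "all_gather_into_tensor"):
+        # newer PyTorch: one fused call writing straight into the flat buffer
+        dist.all_gather_into_tensor(flat_params, my_part, group=group)
+    else:
+        # older path: narrow a list of views and use the list-based API
+        shards = [flat_params.narrow(0, r * part, part) for r in range(world)]
+        dist.all_gather(shards, my_part, group=group)
+```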
+
+## 4. Optimizing AlltoAll for Sequence-Parallel Training
+
+For this part of the optimization, we add some fusion to the communication required by DeepSpeed-Ulysses, to provide a more scalable approach when increasing SP from 2 to 8. (For this study, we consider A100-DGX hardware, which has 8 GPUs per node; increasing the parallelism beyond 8 incurs a performance hit from cross-node communication.)
+
+These fusions are done at two levels:
+1. Fuse the sequence AlltoAll for q, k, and v: we scatter the heads using the mixed tensor rather than splitting them beforehand. For this part, we need some extra information from the modeling side (such as the number of q and kv heads) to split the heads before calling AlltoAll. We have added changes to the Megatron-DeepSpeed repo that incorporate this for sequence parallelism.
+2. Fuse the AlltoAll tensors and call PyTorch's all-to-all-single API: we reshape the tensors along the scatter dimension and use a single tensor for the AlltoAll, which avoids the overhead of a list of tensors that requires a `contiguous` call for each element of the list (see the sketch below).
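+
+A minimal sketch of the fused pattern (illustrative only; the actual DeepSpeed-Ulysses code also fuses q, k, and v and handles differing q/kv head counts):
+
+```python
+import torch
+import torch.distributed as dist
+
+def ulysses_alltoall(x: torch.Tensor, sp_group=None) -> torch.Tensor:
+    """Gather the sequence dimension and scatter the head dimension with a
+    single all_to_all_single call. x: [seq/P, batch, heads, head_dim]."""
+    P = dist.get_world_size(sp_group)
+    s, b, h, d = x.shape
+    # bring the scatter dimension (heads, split into P chunks) to the front
+    x = x.reshape(s, b, P, h // P, d).permute(2, 0, 1, 3, 4).contiguous()
+    out = torch.empty_like(x)
+    dist.all_to_all_single(out, x, group=sp_group)  # one fused collective
+    # out is [P, seq/P, batch, heads/P, head_dim]: full sequence, local heads
+    return out.reshape(P * s, b, h // P, d)
+```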
+
+By adding these optimizations, we see about a 10 to 15% speedup compared to the previous design and obtain good scalability across different SP degrees and context lengths. In the following table, we show the improvement achieved by using SP when doubling the GPU count and increasing the SP degree. We obtain over 80% efficiency when increasing from 256 to 512 GPUs using SP-2. Furthermore, by increasing the sequence length and SP while keeping the number of processed tokens similar, we achieve over 75% efficiency with 2x more resources. On the other hand, if we double the number of tokens (shown in the last row of Table 2), we can improve the speedup to 1.81x.
+
+
+
+| GPUs | bsz | seq | Tokens (M) | SP | Samples (4K) per second | Speedup (x) |
+|----------|:------:|:------:|:------:|:------:|:------:|:------:|
+| 256 | 256 | 8192 | 2 | 1 | 60.71 | 1 |
+| 512 | 256 | 8192 | 2 | 2 | 111.18 | 1.83 |
+| 512 | 128 | 16384 | 2 | 4 | 108.81 | 1.79 |
+| 512 | 64 | 32768 | 2 | 8 | 106.54 | 1.75 |
+| 512 | 64 | 65536 | 4 | 8 | 110.05 | 1.81 |
+
+Table 2. Sequence-Parallelism scalability using DeepSpeed-Ulysses.
+
+
diff --git a/blogs/comm-opt/assets/images/e+d.png b/blogs/comm-opt/assets/images/e+d.png
new file mode 100644
index 000000000000..72ad0f583857
Binary files /dev/null and b/blogs/comm-opt/assets/images/e+d.png differ
diff --git a/blogs/comm-opt/assets/images/sp+fp.png b/blogs/comm-opt/assets/images/sp+fp.png
new file mode 100644
index 000000000000..0b2940418f7a
Binary files /dev/null and b/blogs/comm-opt/assets/images/sp+fp.png differ
diff --git a/blogs/comm-opt/assets/images/sp-conv.png b/blogs/comm-opt/assets/images/sp-conv.png
new file mode 100644
index 000000000000..e1e36b4436a0
Binary files /dev/null and b/blogs/comm-opt/assets/images/sp-conv.png differ
diff --git a/blogs/deepspeed-chat/README.md b/blogs/deepspeed-chat/README.md
new file mode 100644
index 000000000000..43501652b98e
--- /dev/null
+++ b/blogs/deepspeed-chat/README.md
@@ -0,0 +1,332 @@
+
+
+# DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed Chat, please cite our [arxiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+ title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+ author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+ journal={arXiv preprint arXiv:2308.01320},
+ year={2023}
+}
+```
+
+# 1. Overview
+
+ChatGPT-like models have taken the AI world by storm, and it would not be an overstatement to say that their impact on the digital world has been revolutionary. These models are incredibly versatile, capable of performing tasks like summarization, coding, and translation with results on par with or even exceeding the capabilities of human experts. Given the sheer power of these models, multiple efforts are underway in the AI open-source community to make ChatGPT-style models more accessible (e.g., ChatLLaMa, Alpaca, Vicuna, Databricks-Dolly, etc.).
+
+Despite these incredible efforts, there is still a lack of an end-to-end RLHF pipeline capable of training powerful ChatGPT-like models that is easily accessible to the AI community. For instance, training a modest 6.7B ChatGPT model with existing systems typically requires an expensive multi-GPU setup that is beyond the reach of many data scientists. Even with access to such computing resources, training efficiency is often less than 5% of what these machines are capable of (as illustrated [shortly](#effective-throughput-and-scalability-analysis)). Finally, existing solutions simply cannot support easy, fast and affordable training of state-of-the-art ChatGPT models with hundreds of billions of parameters, even given access to multi-GPU clusters.
+
+These limitations stem from the lack of a robust system design capable of effectively supporting the complex RLHF training pipeline of InstructGPT, which is quite different from the standard pre-training and fine-tuning pipelines that existing DL systems are designed for. Therefore, in the spirit of democratizing ChatGPT-like models and making RLHF training truly accessible to the AI community, today we are releasing DeepSpeed-Chat with the following three capabilities:
+
+(i) ***Easy-to-use Training and Inference Experience for ChatGPT-like Models***: A single script capable of taking a pre-trained Huggingface model, running it through all three steps of InstructGPT training using the DeepSpeed-RLHF system, and producing your very own ChatGPT-like model. In addition, we provide an inference API for testing conversation-style interactions after the model is trained.
+
+(ii) ***DeepSpeed-RLHF Pipeline***: The DeepSpeed-RLHF pipeline primarily replicates the training pipeline from the InstructGPT paper with careful attention to completeness and one-to-one correspondence with its three steps: a) Supervised Fine-tuning (SFT), b) Reward Model Fine-tuning, and c) Reinforcement Learning with Human Feedback (RLHF). Additionally, we offer data abstraction and blending capabilities to enable training with multiple data sources.
+
+(iii) ***DeepSpeed-RLHF System***: A robust and sophisticated RLHF system that combines the training and inference prowess of DeepSpeed into a single unified Hybrid Engine (DeepSpeed-HE) for RLHF. The Hybrid-Engine is capable of seamlessly transitioning between inference and training modes within RLHF, allowing it to leverage various optimizations from DeepSpeed-Inference such as tensor-parallelism and high-performance transformer kernels for generation, while also benefiting from the multitude of ZeRO- and LoRA-based memory optimization strategies for RL training. DeepSpeed-HE is also aware of the full RLHF pipeline, allowing it to make optimal decisions in terms of memory management and data movement across different phases of RLHF.
+
+
+The DeepSpeed-RLHF system delivers unparalleled efficiency at scale, making complex RLHF training fast, affordable, and easily accessible to the AI community:
+
+***Efficiency and Affordability***: In terms of efficiency, [DeepSpeed-HE is over 15x faster than existing systems](#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), making RLHF training both fast and affordable. For instance, DeepSpeed-HE can train an OPT-13B in just 9 hours and OPT-30B in 18 hours on Azure Cloud for under $300 and $600, respectively.
+
+
+
+
+| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B |
+|-------------|:--------:|:--------------:|:-------------:|:-----------:|
+| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA |
+| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) |
+
+*Table 1. Single-Node 8x A100: Training Time and Corresponding Approximate Cost on Azure.*
+
+
+
+
+***Excellent Scalability***: DeepSpeed-HE supports models with hundreds of billions of parameters and can achieve excellent scalability on multi-node multi-GPU systems. As a result, even a 13B model can be trained in 1.25 hours and a massive 175B model can be trained with DeepSpeed-HE in under a day.
+
+
+
+
+| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-175B |
+|---------------|:-----------------:|:---------------:|:-------------:|:-------------:|
+| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120)|
+
+*Table 2. Multi-Node 64x A100-80GB: Training Time and Corresponding Approximate Cost on Azure.*
+
+
+> ***Very Important Details***: The numbers in both tables above are for Step 3 of the training and are based on actual measured training throughput on DeepSpeed-RLHF curated dataset and training recipe which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs). We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for more details.
+
+
+***Democratizing RLHF Training***: With just a single GPU, DeepSpeed-HE supports training models with over 13 billion parameters, enabling data scientists without access to multi-GPU systems to create not just toy RLHF models but large and powerful ones that can be used in real-world scenarios.
+
+
+
+| | V100 32G | A6000 48G | A100 40G | A100 80G |
+|------------|:---------:|:----------:|:--------:|:---------:|
+| Model Size | OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B |
+
+Table 3. Max Model Size Supported by DeepSpeed-HE on a Single GPU
+
+
+
+Next, we dive deeper into the three capabilities of DeepSpeed-Chat introduced above.
+
+# 2. Easy-to-use ChatGPT Training and Inference Experience
+
+We start with the easy-to-use experience by showing how you can train OPT-13B and then OPT-66B models with DeepSpeed-RLHF system. If you are short on time, you can even train an OPT-1.3B model on a single consumer-grade GPU in just two hours. We also demonstrate how you can use our DeepSpeed-chat RLHF API to develop your own custom pipelines.
+
+## Training your first ChatGPT-Style model is so easy with DeepSpeed-Chat’s RLHF examples
+
+*a) A single script completes all three stages of RLHF training and generates your first ChatGPT model!*
+
+We use an example of pretrained OPT-13B as the actor model and OPT-350M as the reward model in the following single script to generate a final 13B ChatGPT-style model:
+
+```bash
+pip install "deepspeed>=0.9.0"
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-Chat/
+pip install -r requirements.txt
+
+python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
+```
+
+In about half a day, your 13B model will be fully trained, with its checkpoints ready. The following table shows a breakdown of the training time for each of the three steps:
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:------:|:------:|:------:|:------:|
+| Actor: OPT-13B, Reward: OPT-350M | 2.5hr | 0.25hr | 10.8hr | 13.6hr |
+
+Table 4. E2E time breakdown for training a 13 billion parameter ChatGPT model via DeepSpeed-Chat on a single DGX node with 8 NVIDIA A100-40G GPUs.
+
+
+
+*b) Test your final model with DeepSpeed-Chat’s Inference API*
+
+Now it is time to plug in and test your final model trained by DeepSpeed-Chat! You can play with not only Q&A sessions but also conversation-style interactions.
+
+```
+Human: Do you know Microsoft?
+Assistant: Yes, I’m a big fan of Microsoft. What about them?
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
+Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which
+ is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the
+ programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
+ use the computer.
+```
+
+## Want to try different model sizes and configurations? You got it!
+We understand users often like to try different model sizes and configurations to meet their training time, resource, and quality requirements. With DeepSpeed-Chat, users can easily do that. For example, if you want to train a larger, higher-quality model on your GPU cluster for your research or business, you can simply use the same script with your desired model size (e.g., 66B) and GPU count (e.g., 64 GPUs):
+
+```bash
+python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
+```
+
+Within 9 hours, you can have your 66-billion-parameter ChatGPT model ready to be served in your favorite front-end GUI:
+
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|---------------------------------- |:-------:|:------:|:------:|:------:|
+| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5hr | 9hr |
+
+Table 5. E2E time breakdown for training a 66 billion parameter ChatGPT model via DeepSpeed-Chat on 8 DGX nodes with 8 NVIDIA A100-80G GPUs/node.
+
+
+
+If you only have around 1-2 hours for a coffee or lunch break, you can also try to train a small/toy model with DeepSpeed-Chat. For example, we prepared a training example for a 1.3B model with a single dataset to test our framework on consumer-grade GPUs. The best part is that you will have your model checkpoint ready to play with when you are back from your lunch break!
+
+```bash
+python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
+```
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:---------:|:--------:|:------:|:------:|
+| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2hr | 2.2hr |
+
+*Table 6. E2E time breakdown for training a 1.3 billion parameter ChatGPT model via DeepSpeed-Chat on a single commodity NVIDIA A6000 GPU with 48GB memory.*
+
+
+
+## Customizing your own RLHF training pipeline using DeepSpeed-Chat’s RLHF APIs
+DeepSpeed-Chat allows users to build their very own RLHF training pipeline using our flexible APIs, shown below, with which users can construct their own RLHF training strategy. This enables a general interface and backend for creating a wide range of RLHF algorithms for research exploration.
+
+```python
+
+engine = DeepSpeedRLHFEngine(
+ actor_model_name_or_path=args.actor_model_name_or_path,
+ critic_model_name_or_path=args.critic_model_name_or_path,
+ tokenizer=tokenizer,
+ num_total_iters=num_total_iters,
+ args=args)
+
+trainer = DeepSpeedPPOTrainer(engine=engine, args=args)
+
+for prompt_batch in prompt_train_dataloader:
+ out = trainer.generate_experience(prompt_batch)
+ actor_loss, critic_loss = trainer.train_rlhf(out)
+```
+
+# 3. Full-fledged RLHF Training Pipeline
+
+To provide a seamless training experience, we follow InstructGPT and include a full-fledged end-to-end training pipeline in DeepSpeed-Chat as shown in Figure 1.
+
+
+
+
+
+*Figure 1: The illustration of DeepSpeed-Chat’s RLHF training pipeline with optional features.*
+
+
+
+Our pipeline includes three main steps:
+
+* **Step 1: Supervised finetuning (SFT)**, where human responses to various queries are carefully selected to finetune the pretrained language models.
+* **Step 2: Reward model finetuning**, where a separate (usually smaller than the SFT model) reward model (RW) is trained on a dataset that contains human-provided rankings of multiple answers to the same query.
+* **Step 3: RLHF training**, where the SFT model is further finetuned with the reward feedback from the RW model using the Proximal Policy Optimization (PPO) algorithm.
+
+We provide two additional features in Step 3 to help improve model quality:
+
+* **Exponential Moving Average (EMA) collection**, where an EMA-based checkpoint can be chosen for the final evaluation.
+* **Mixture Training**, which mixes the pretraining objective (i.e., next-word prediction) with the PPO objective to prevent performance regression on public benchmarks like SQuAD2.0.
+
+The two training features, EMA and Mixture Training, are often omitted by other recent efforts since they are optional. However, according to InstructGPT, EMA checkpoints generally provide better response quality than the conventional final trained model, and Mixture Training can help the model retain its pre-training benchmark-solving ability. As such, we provide them so that users can fully reproduce the training experience described in InstructGPT and strive for higher model quality.
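+
+For reference, the EMA update is a simple per-step blend of the live weights into a shadow copy (a minimal sketch; the decay value here is illustrative):
+
+```python
+import torch
+
+@torch.no_grad()
+def update_ema(ema_model, model, beta: float = 0.999):
+    # ema <- beta * ema + (1 - beta) * param, applied after each update step
+    for ema_p, p in zip(ema_model.parameters(), model.parameters()):
+        ema_p.mul_(beta).add_(p, alpha=1.0 - beta)
+```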
+
+In addition to being highly consistent with the InstructGPT paper, we also provide convenient features to support researchers and practitioners in training their own RLHF models with multiple data sources:
+
+* ***Data Abstraction and Blending Capabilities:*** DeepSpeed-Chat is able to train the model with multiple datasets for better model quality. It is equipped with (1) an abstract dataset layer to unify the format of different datasets; and (2) data splitting/blending capabilities so that the multiple datasets are properly blended and then split across the 3 training stages.
+
+To illustrate the effectiveness of our training pipeline, we demonstrate the model quality with multi-round conversation as shown in the experience section.
+
+
+# 4. DeepSpeed Hybrid Engine – Unified Infrastructure to Power and Optimize RLHF Training
+
+Step 1 and Step 2 of the instruct-guided RLHF pipeline resemble regular fine-tuning of large models, and they are powered by ZeRO-based optimizations and a flexible combination of parallelism strategies in DeepSpeed training to achieve scale and speed. Step 3 of the pipeline, on the other hand, is the most complex part to handle in terms of performance implications. Each iteration requires efficient processing of two phases: a) the inference phase for token/experience generation, producing inputs for the training, and b) the training phase to update the weights of the actor and reward models, as well as the interaction and scheduling between them. This introduces two major costs: (1) the memory cost, as several copies of the SFT and RW models need to be served throughout stage 3; and (2) the predominant generation phase, which, if not accelerated properly, will significantly slow down the entire stage 3. Additionally, the two important features we added in Stage 3, Exponential Moving Average (EMA) collection and Mixture Training, incur additional memory and training costs.
+
+To tackle these challenges, we composed the full system capability of DeepSpeed Training and Inference into a unified infrastructure that we call the **Hybrid Engine**. It leverages the original DeepSpeed engines for the fast training mode while effortlessly applying the DeepSpeed inference engine for the generation/evaluation mode, providing a significantly faster training system for RLHF at Stage 3. As Figure 2 shows, the transition between the DeepSpeed training and inference engines is seamless: by switching the actor model between the typical eval and train modes, DeepSpeed applies different optimizations for the inference and training pipelines, running the model faster and improving overall system throughput.
+
+
+
+
+
+
+*Figure 2. DeepSpeed Hybrid Engine design for accelerating the most time-consuming portion of an RLHF pipeline.*
+
+
+
+During its inference execution for the experience generation phase of RLHF training, DeepSpeed Hybrid Engine uses a light-weight memory management system to handle the KV-cache and intermediate results, together with highly optimized inference-adapted kernels and tensor parallelism implementation, to achieve a significant boost in throughput (tokens-per-second) compared to the existing solutions.
+
+During the training execution, Hybrid Engine enables memory optimization techniques such as DeepSpeed's ZeRO family of technologies and Low-Rank Adaptation (LoRA). We designed and implemented these system optimizations in a way that they are compatible with each other and can be composed together to deliver the highest training efficiency under the unified Hybrid Engine.
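+
+Putting the two modes together, one Stage-3 iteration conceptually proceeds as in the simplified sketch below (illustrative pseudocode, not the actual Hybrid Engine API; `prompt_dataloader`, `reward_model`, and `ppo_loss` are placeholder names):
+
+```python
+import torch
+
+for prompt_batch in prompt_dataloader:
+    actor.eval()                      # inference mode: fused kernels, TP, KV-cache
+    with torch.no_grad():
+        seq = actor.generate(prompt_batch, max_new_tokens=256)
+    reward = reward_model(seq)
+
+    actor.train()                     # training mode: ZeRO sharding, LoRA
+    loss = ppo_loss(actor, seq, reward)
+    actor.backward(loss)              # DeepSpeed engine train-mode API
+    actor.step()
+```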
+
+Hybrid Engine can seamlessly change model partitioning across training and inference to support tensor-parallelism based inferencing and ZeRO-based sharding mechanisms for training. It can also reconfigure the memory system to maximize memory availability during each of these modes. This allows for improved performance by avoiding memory allocation bottlenecks and supporting large batch sizes. Packed with a spectrum of system technologies from DeepSpeed training and inference, Hybrid Engine pushes the boundary of modern RLHF training and delivers unparalleled scale and system efficiency for RLHF workloads.
+
+
+# 5. DeepSpeed RLHF: Unparalleled Scale and Efficiency via Hybrid Engine
+
+## Capability Recap
+
+As discussed, DeepSpeed-HE is an amalgamation of powerful system technologies for inference and training, architected to achieve excellent scale and efficiency for the DeepSpeed-RLHF pipeline across a wide range of hardware, making RLHF training fast, affordable, and easily accessible to the AI community.
+
+In terms of efficiency and affordability, as shown in Table 1, DeepSpeed-HE can train OPT-13B in just 9 hours and OPT-30B in 18 hours on Azure Cloud for under $300 and $600, respectively. In terms of speed and scalability, as shown in Table 2, even a 13B model can be trained in 1.25 hours and a massive 175B model can be trained in under a day using a 64 GPU cluster. And in terms of accessibility and democratization of RLHF, DeepSpeed-HE supports training models with over 13 billion parameters on a single GPU as shown in Table 3.
+
+## Throughput and Model Size Scalability Comparisons with Existing RLHF Systems
+
+Compared to other RLHF systems like Colossal-AI or HuggingFace powered by native PyTorch, DeepSpeed-RLHF excels in system performance and model scalability:
+
+* With respect to throughput, DeepSpeed enables over 10x improvement for RLHF training on a single GPU (Figure 3). On a multi-GPU setup, it enables a 6 - 19x speedup over Colossal-AI and a 1.4 - 10.5x speedup over HuggingFace DDP (Figure 4).
+* With respect to model scalability, while Colossal-AI can run a max model size of 1.3B on a single GPU and 6.7B on a single A100 40G node, DeepSpeed-HE can run 6.5B and 50B models respectively on the same hardware, up to 7.5x larger.
+
+Therefore, with over an order of magnitude higher throughput, DeepSpeed-HE unlocks the ability to train significantly larger actor models under the same latency budget or train models of similar size at over 10x lower cost, compared to existing RLHF systems like Colossal-AI or HuggingFace DDP.
+
+
+
+
+
+*Figure 3. Step 3 throughput comparison against two other system frameworks for accelerating RLHF \
+training on a single NVIDIA A100-40G commodity GPU. No icons represent OOM scenarios.*
+
+
+
+
+
+
+
+*Figure 4. End-to-end training throughput comparison for step 3 of the training pipeline (the most time \
+consuming portion) with different model sizes on a single DGX node equipped with 8 NVIDIA A100-40G GPUs.\
+No icons represent OOM scenarios.*
+
+
+
+This improvement in efficiency stems from DeepSpeed-HE's ability to accelerate the generation phase of RLHF processing by leveraging DeepSpeed inference optimizations. Figure 5 shows the time breakdown for a 1.3B parameter model over an RLHF training iteration: the majority of the time goes to the generation phase. By leveraging high-performance inference kernels from DeepSpeed, DeepSpeed-HE can achieve up to a 9x throughput improvement during this phase over HuggingFace and 15x over Colossal-AI, allowing it to achieve unparalleled end-to-end efficiency.
+
+
+
+
+
+*Figure 5. Superior generation phase acceleration from DeepSpeed Chat’s Hybrid Engine: A time/sequence breakdown for training OPT-1.3B actor model + OPT-350M reward model on a single DGX node with 8 A100-40G GPUs.*
+
+
+
+## Effective Throughput and Scalability Analysis
+
+***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE during Stage 3 of RLHF training depends on the throughput it achieves during the generation and RL training phases. In our RLHF pipeline, the generation phase comprises approximately 20% of the total computation, while the RL training phase comprises the remaining 80% (see the [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for details). However, despite its small proportion, the former can take a large portion of the e2e time, as it requires running the actor model once for each of the 256 generated tokens with an initial prompt of 256 tokens, making it memory-bandwidth bound and difficult to achieve high throughput for. In contrast, the RL training phase is compute bound: it runs the reference actor model with just a couple of forward and backward passes over the full 512 tokens from both prompt and generation per sample, and can achieve good throughput.
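+
+A back-of-the-envelope estimate (illustrative numbers, not measured) shows why single-batch generation is bandwidth bound: every generated token must stream essentially all model weights through HBM at least once.
+
+```python
+params = 13e9                  # e.g., OPT-13B
+bytes_per_param = 2            # fp16
+hbm_bw = 1.6e12                # ~1.6 TB/s on an A100-40G
+t_token = params * bytes_per_param / hbm_bw     # ~16 ms floor per token
+tflops = 2 * params / t_token / 1e12            # ~1.6 TFlops/GPU at batch 1
+# vs. ~312 TFlops fp16 peak: a tiny fraction of peak unless large batches
+# amortize the weight reads, which is exactly what DeepSpeed-HE targets.
+```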
+
+
+
+
+
+*Figure 6. RLHF Generation, training, and effective throughput with DeepSpeed-HE for different model sizes, at the GPU count that maximizes efficiency.*
+
+
+
+To maximize the effective throughput, DeepSpeed-HE optimizes both phases. First, it uses the largest batch size possible to get higher efficiency in both phases. Second, during the generation phase, it leverages high-performance transformer kernels to maximize GPU memory bandwidth utilization when the model fits in a single GPU's memory, and leverages tensor-parallelism (TP) when it does not. Using TP in the generation phase instead of ZeRO to fit the model reduces inter-GPU communication and maintains high GPU memory bandwidth utilization.
+
+Figure 6 shows the best achievable effective throughput of DeepSpeed-HE in TFlops/GPU for model sizes ranging from 1.3B to 175B, along with the throughput achieved by each of the generation and training phases. DeepSpeed-HE is most efficient for models in the 6.7B-66B range. Beyond this range, at 175B, the throughput drops due to the limited memory available for larger batch sizes, while still achieving 1.2x better efficiency than the small 1.3B model. The per-GPU throughput of these gigantic models could improve further when we scale them to more GPUs, with more memory available for larger batch sizes.
+
+Furthermore, we would like to point out that our effective performance is 19x higher than existing systems, as shown in Figure 4, which suggests that they are operating at lower than 5% of the peak. This demonstrates the challenge of optimizing RLHF workloads as well as the effectiveness of our system despite the challenge.
+
+
+
+
+
+*Figure 7. Scalability for training 13B (left) and 66B (right) actor model+350M reward model on an increasing number of DGX nodes with 8 A100-40/80G GPUs*
+
+
+
+***(II) Scalability Analysis.*** The best effective throughput for different model sizes is achieved at different GPU counts. This is partly because some of the larger model sizes require more memory to run. However, a large part of this behavior stems from DeepSpeed-HE's scalability properties, which we discuss next.
+
+Figure 7 shows that DeepSpeed-RLHF achieves good scaling overall on up to 64 GPUs. However, looking more closely, DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near-linear or sub-linear scaling at larger scales. This is due to the interaction between memory availability and the max global batch size.
+
+As DeepSpeed-HE is powered by ZeRO-based technology for training, it allows model states to be partitioned across the available GPUs. As a result, the memory consumption per GPU decreases as the number of GPUs increases, allowing DeepSpeed-HE to support a larger batch per GPU and resulting in super-linear scaling. At larger scale, however, while the available memory continues to increase, the maximum global batch size (1024 in our case, with a sequence length of 512) limits the batch size per GPU, resulting in near-linear or sub-linear scaling.
+As a result, for a given max global batch size, DeepSpeed-HE achieves the best throughput and cost efficiency at the boundary of super-linear and sub-linear scalability, and the exact point is mostly determined by the largest batch size that can be run per GPU, as a function of the available memory and the global batch size.
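+
+The interplay can be captured with a tiny illustrative model (all constants here are made up except the 1024 global batch cap; real per-GPU capacity grows with the memory freed by ZeRO rather than exactly linearly):
+
+```python
+def per_gpu_batch(n_gpu, global_cap=1024, mem_base=4):
+    memory_limited = mem_base * n_gpu    # ZeRO frees memory as GPUs grow
+    cap_limited = global_cap // n_gpu    # fixed global batch / #GPUs
+    return min(memory_limited, cap_limited)
+
+for n in (8, 16, 32, 64):
+    print(n, per_gpu_batch(n))
+# 8 -> 32 and 16 -> 64: memory bound, per-GPU batch grows (super-linear)
+# 32 -> 32 and 64 -> 16: cap bound, per-GPU batch shrinks (sub-linear)
+```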
+
+# 6. Release: Try DeepSpeed Chat Today!
+
+We are very excited to share that DeepSpeed-Chat is now open-sourced and available to the AI community.
+
+* To get started, please visit our github page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)
+
+* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for the future.
+
+DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
diff --git a/blogs/deepspeed-chat/assets/images/ds_chat_main.png b/blogs/deepspeed-chat/assets/images/ds_chat_main.png
new file mode 100644
index 000000000000..3266a425b102
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/ds_chat_main.png differ
diff --git a/blogs/deepspeed-chat/assets/images/ds_chat_stability_sweep.png b/blogs/deepspeed-chat/assets/images/ds_chat_stability_sweep.png
new file mode 100644
index 000000000000..d98cd765bc6a
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/ds_chat_stability_sweep.png differ
diff --git a/blogs/deepspeed-chat/assets/images/ds_chat_zero_offload_gpu.png b/blogs/deepspeed-chat/assets/images/ds_chat_zero_offload_gpu.png
new file mode 100644
index 000000000000..935cadc0cf13
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/ds_chat_zero_offload_gpu.png differ
diff --git a/blogs/deepspeed-chat/assets/images/dschat-llama-13b-HE-perf.png b/blogs/deepspeed-chat/assets/images/dschat-llama-13b-HE-perf.png
new file mode 100644
index 000000000000..56cf6280d8a5
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/dschat-llama-13b-HE-perf.png differ
diff --git a/blogs/deepspeed-chat/assets/images/dschat-llama-7b-HE-perf.png b/blogs/deepspeed-chat/assets/images/dschat-llama-7b-HE-perf.png
new file mode 100644
index 000000000000..93342fffbc60
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/dschat-llama-7b-HE-perf.png differ
diff --git a/blogs/deepspeed-chat/assets/images/dschat-mpzero-llama.png b/blogs/deepspeed-chat/assets/images/dschat-mpzero-llama.png
new file mode 100644
index 000000000000..ae7f5f62f52e
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/dschat-mpzero-llama.png differ
diff --git a/blogs/deepspeed-chat/assets/images/llama2-with-he.png b/blogs/deepspeed-chat/assets/images/llama2-with-he.png
new file mode 100644
index 000000000000..de5015d277bf
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/llama2-with-he.png differ
diff --git a/blogs/deepspeed-chat/assets/images/sweep_after_lora_fix.png b/blogs/deepspeed-chat/assets/images/sweep_after_lora_fix.png
new file mode 100644
index 000000000000..d12dca7ac49c
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/sweep_after_lora_fix.png differ
diff --git a/blogs/deepspeed-chat/assets/images/sweep_before_lora_fix.png b/blogs/deepspeed-chat/assets/images/sweep_before_lora_fix.png
new file mode 100644
index 000000000000..ab01a46cddf7
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/sweep_before_lora_fix.png differ
diff --git a/blogs/deepspeed-chat/assets/images/zero_offload_after_stability.png b/blogs/deepspeed-chat/assets/images/zero_offload_after_stability.png
new file mode 100644
index 000000000000..8392f8230ff7
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/zero_offload_after_stability.png differ
diff --git a/blogs/deepspeed-chat/chinese/README.md b/blogs/deepspeed-chat/chinese/README.md
new file mode 100644
index 000000000000..03bf9b69449f
--- /dev/null
+++ b/blogs/deepspeed-chat/chinese/README.md
@@ -0,0 +1,325 @@
+
+
+# DeepSpeed Chat: 一键式RLHF训练,让你的类ChatGPT千亿大模型提速省钱15倍
+
+
+
+
+
+
+
+
+
+如需引用 DeepSpeed Chat,请引用我们的[arxiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+ title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+ author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+ journal={arXiv preprint arXiv:2308.01320},
+ year={2023}
+}
+```
+
+# 1. 概述
+
+近日来,ChatGPT及类似模型引发了人工智能(AI)领域的一场风潮。 这场风潮对数字世界产生了革命性影响。ChatGPT类模型具有惊人的泛用性,能够执行归纳、编程、翻译等任务,其结果与人类专家相当甚至更优。为了使ChatGPT等模型的训练和部署更轻松,AI 开源社区进行了各种尝试(例如 ChatLLaMa、Alpaca、Vicuna、Databricks-Dolly等)。
+
+然而,尽管开源社区付出了巨大的努力,目前仍缺乏一个支持端到端的基于人工反馈机制的强化学习(RLHF)的规模化系统,这使得训练强大的类ChatGPT模型十分困难。例如,使用现有的开源系统训练一个具有 67 亿参数的类ChatGPT模型通常需要昂贵的多卡至多节点的 GPU 集群,但这些资源对大多数数据科学家或研究者而言难以获取。同时,即使有了这样的计算资源,[现有的开源系统的训练效率通常还不到这些机器所能达到的最大效率的5%](#有效吞吐量和可扩展性分析)。简而言之,即使有了昂贵的多GPU集群,现有解决方案也无法轻松、快速、经济的训练具有数千亿参数的最先进的类ChatGPT模型。
+
+ChatGPT模型的训练是基于InstructGPT论文中的RLHF方式。这与常见的大语言模型的预训练和微调截然不同。这使得现有深度学习系统在训练类ChatGPT模型时存在种种局限。因此,为了让ChatGPT类型的模型更容易被普通数据科学家和研究者使用,并使RLHF训练真正普及到AI社区,我们发布了 DeepSpeed-Chat。DeepSpeed-Chat具有以下三大核心功能:
+
+
+(i)***简化 ChatGPT 类型模型的训练和强化推理体验***:只需一个脚本即可实现多个训练步骤,包括使用 Huggingface 预训练的模型、使用 DeepSpeed-RLHF 系统运行 InstructGPT 训练的所有三个步骤、甚至生成你自己的类ChatGPT模型。此外,我们还提供了一个易于使用的推理API,用于用户在模型训练后测试对话式交互。
+
+(ii)***DeepSpeed-RLHF 模块***:DeepSpeed-RLHF 复刻了 InstructGPT 论文中的训练模式,并确保包括a) 监督微调(SFT),b) 奖励模型微调和 c) 基于人类反馈的强化学习(RLHF)在内的三个步骤与其一一对应。此外,我们还提供了数据抽象和混合功能,以支持用户使用多个不同来源的数据源进行训练。
+
+(iii)***DeepSpeed-RLHF 系统***:我们将 DeepSpeed 的训练(training engine)和推理能力(inference engine) 整合到一个统一的混合引擎(DeepSpeed Hybrid Engine or DeepSpeed-HE)中用于 RLHF 训练。DeepSpeed-HE 能够在 RLHF 中无缝地在推理和训练模式之间切换,使其能够利用来自 DeepSpeed-Inference 的各种优化,如张量并行计算和高性能CUDA算子进行语言生成,同时对训练部分还能从 ZeRO- 和 LoRA-based 内存优化策略中受益。DeepSpeed-HE 还能够自动在 RLHF 的不同阶段进行智能的内存管理和数据缓存。
+
+
+DeepSpeed-RLHF 系统在大规模训练中具有无与伦比的效率,使复杂的 RLHF 训练变得快速、经济并且易于大规模推广:
+
+**高效性和经济性**:[DeepSpeed-HE 比现有系统快 15 倍以上](#与现有-RLHF-系统的吞吐量和模型大小可扩展性比较),使 RLHF 训练快速且经济实惠。例如,DeepSpeed-HE 在 Azure 云上只需 9 小时即可训练一个 OPT-13B模型,只需 18 小时即可训练一个 OPT-30B模型。这两种训练分别花费不到 300 美元和 600 美元。
+
+
+
+| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B |
+|-------------|:--------:|:--------------:|:-------------:|:-----------:|
+| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA |
+| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) |
+
+*表 1. 单节点 8x A100:训练时长及预估的 Azure 费用。*
+
+
+
+***卓越的扩展性***:DeepSpeed-HE 能够支持训练拥有数千亿参数的模型,并在多节点多 GPU 系统上展现出卓越的扩展性。因此,即使是一个拥有 130 亿参数的模型,也只需 1.25 小时就能完成训练。而对于庞大的拥有 1750 亿参数的模型,使用 DeepSpeed-HE 进行训练也只需不到一天的时间。
+
+
+
+| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-175B |
+|---------------|:-----------------:|:---------------:|:-------------:|:-------------:|
+| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120)|
+
+*表 2. 多节点 64x A100-80GB:训练时长及预估的 Azure 费用。*
+
+
+> ***非常重要的细节***: 上述两个表格(即表一和表二)中的数据均针对 RLHF 训练的第 3 步,基于实际数据集和 DeepSpeed-RLHF 训练吞吐量的测试。该训练在总共 1.35 亿(135M)个字符(token)上进行一个时期(epoch)的训练。我们总共有 6750 万个查询(query)字符(131.9k 个 query,每个序列长度为 256)和 6750 万个生成/回答字符(131.9k 个答案,每个序列长度为 256),每步的最大全局字符批量大小约为 500 万个字符(1024 个查询-答案对)。在与 DeepSpeed-RLHF 进行任何成本和端到端时间比较之前,我们建议读者注意这些设定。想要了解更多详细信息,请参阅我们的页面 [benchmark setting](https://github.com/microsoft/DeepSpeedExamples-internal/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)。
+
+***实现 RLHF 训练的普及化***:仅凭单个 GPU,DeepSpeed-HE 就能支持训练超过 130 亿参数的模型。这使得那些无法使用多 GPU 系统的数据科学家和研究者不仅能够轻松创建轻量级的 RLHF 模型,还能创建大型且功能强大的模型,以应对不同的使用场景。
+
+
+
+| | V100 32G | A6000 48G | A100 40G | A100 80G |
+|------------|:---------:|:----------:|:--------:|:---------:|
+| Model Size | OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B |
+
+*表 3. DeepSpeed-HE 在不同的 GPU 单卡上支持的最大模型。*
+
+
+
+接下来,我们将深入了解上面介绍的 DeepSpeed-Chat 的三个功能。
+
+# 2. 简洁高效且经济的 ChatGPT 训练与推理体验
+
+接下来,我们向你展示如何使用一行命令即可利用 DeepSpeed-RLHF 系统训练 OPT-13B 和 OPT-66B 模型。即使时间有限,你也可以在大约两小时内在单个消费级 GPU 上训练一个 OPT-1.3B 模型。此外,我们还演示了如何利用我们的 DeepSpeed-chat RLHF API 来开发你自己的模型:
+
+## 使用 DeepSpeed-Chat 的 RLHF 示例轻松训练你的第一个 类ChatGPT 模型
+
+*a) 仅需一个脚本,完成 RLHF 训练的全部三个阶段,生成你的第一个 ChatGPT 模型!*
+
+以下示例展示了如何使用一个脚本,以预训练的 OPT-13B 作为 actor 模型,OPT-350M 作为 reward 模型,生成一个最终的 130 亿参数的 ChatGPT 类型的模型:
+```bash
+pip install "deepspeed>=0.9.0"
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-Chat/
+pip install -r requirements.txt
+
+python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
+```
+
+
+大约半天时间,你拥有 130 亿参数的模型即可完成训练。下表展示了整个过程在三个步骤中所需的时间:
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:------:|:------:|:------:|:------:|
+| Actor: OPT-13B, Reward: OPT-350M | 2.5hr | 0.25hr | 10.8hr | 13.6hr |
+
+*表 4. 在单个DGX节点上(8x A100-40G),针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-13b所需的时间。*
+
+
+
+*b) 使用 DeepSpeed-Chat 的推理 API 测试训练好的模型*
+
+将训练好的模型导入提供的简易 API,你不仅可以与模型进行问答互动,还可以进行对话式交流。
+
+```
+Human: Do you know Microsoft?
+Assistant: Yes, I’m a big fan of Microsoft. What about them?
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
+Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which
+ is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the
+ programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
+ use the computer.
+```
+
+## 想尝试不同的模型大小和配置?轻松实现!
+我们了解到用户通常喜欢尝试不同的模型大小和配置,以满足他们不同的训练时间、资源和质量的需求。借助 DeepSpeed-Chat,你可以轻松实现这些目标。例如,如果你想在 GPU 集群上训练一个更大、更高质量的模型,用于你的研究或业务,你可以使用相同的脚本,只需输入你期望的模型大小(例如 660 亿参数)和 GPU 数量(例如 64 个 GPU):
+
+```bash
+python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
+```
+
+在接下来的 9 小时内,你将拥有一个 660 亿参数的 ChatGPT 模型,并可以在你喜欢的前端 GUI 中使用:
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|---------------------------------- |:-------:|:------:|:------:|:------:|
+| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5hr | 9hr |
+
+*表 5. 在8个DGX节点上(64x A100-80G),针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-66b所需的时间。*
+
+
+
+如果你只有大约 1-2 小时的咖啡或午餐休息时间,你也可以尝试使用 DeepSpeed-Chat 训练一个小型模型。例如,我们为单个数据集准备了一个 13 亿参数模型的训练示例,以便在你的消费级 GPU 上测试我们的框架。当你从午餐休息回来时,你的模型已经准备好供你使用!
+
+```bash
+python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
+```
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:---------:|:--------:|:------:|:------:|
+| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2hr | 2.2hr |
+
+*表 6. 在单个消费级A6000-48G上,针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-1.3b所需的时间。*
+
+
+
+## 利用 DeepSpeed-Chat 的 RLHF API 自定义你自己的 RLHF 训练流程
+DeepSpeed-Chat 允许用户使用我们灵活的 API(如下所示)构建自己的 RLHF 训练流程,用户可以使用这些 API 重建自己的 RLHF 训练策略。我们希望这些功能可以为研究探索中创建各种 RLHF 算法提供通用接口和后端。
+```python
+
+engine = DeepSpeedRLHFEngine(
+ actor_model_name_or_path=args.actor_model_name_or_path,
+ critic_model_name_or_path=args.critic_model_name_or_path,
+ tokenizer=tokenizer,
+ num_total_iters=num_total_iters,
+ args=args)
+
+trainer = DeepSpeedPPOTrainer(engine=engine, args=args)
+
+for prompt_batch in prompt_train_dataloader:
+ out = trainer.generate_experience(prompt_batch)
+ actor_loss, critic_loss = trainer.train_rlhf(out)
+```
+
+# 3. 完整的 RLHF 训练流程概述
+
+为了实现无缝的训练体验,我们遵循 InstructGPT 论文的方法,并在 DeepSpeed-Chat 中整合了一个端到端的训练流程,如图 1 所示。
+
+
+
+
+
+*图 1: DeepSpeed-Chat 的 RLHF 训练流程图示,包含了一些可选择的功能。*
+
+
+
+我们的流程包括三个主要步骤:
+
+* **步骤1:监督微调(SFT)** —— 使用精选的人类回答来微调预训练的语言模型以应对各种查询;
+* **步骤2:奖励模型微调** —— 使用一个包含人类对同一查询的多个答案打分的数据集来训练一个独立的(通常比 SFT 小的)奖励模型(RW);
+* **步骤3:RLHF 训练** —— 利用 Proximal Policy Optimization(PPO)算法,根据 RW 模型的奖励反馈进一步微调 SFT 模型。
+
+在步骤3中,我们提供了两个额外的功能,以帮助提高模型质量:
+* **指数移动平均(EMA)** —— 可以选择基于 EMA 的检查点进行最终评估
+* **混合训练** —— 将预训练目标(即下一个单词预测)与 PPO 目标混合,以防止在像 SQuAD2.0 这样的公开基准测试中的性能损失
+
+这两个训练功能,EMA 和混合训练,常常被其他的开源框架所忽略,因为它们并不会妨碍训练的进行。然而,根据 InstructGPT,EMA 通常比传统的最终训练模型提供更好的响应质量,而混合训练可以帮助模型保持预训练基准解决能力。因此,我们为用户提供这些功能,以便充分获得 InstructGPT 中描述的训练体验,并争取更高的模型质量。
+
+除了与 InstructGPT 论文高度一致外,我们还提供了一项方便的功能,以支持研究人员和从业者使用多个数据资源训练他们自己的 RLHF 模型:
+
+* **数据抽象和混合能力**: DeepSpeed-Chat 能够使用多个不同来源的数据集训练模型以获得更好的模型质量。它配备了(1)一个抽象数据集层,以统一不同数据集的格式;以及(2)数据拆分/混合功能,以便多个数据集在 3 个训练阶段中被适当地混合然后拆分。
+
+在我们之前的章节中,你可以看到使用整个 DeepSpeed-Chat 训练模型在多轮对话中的表现。
+
+# 4. DeepSpeed Hybrid Engine: A Unified, Efficient Engine that Powers and Optimizes RLHF Training
+
+The first two steps of the DeepSpeed-Chat pipeline resemble regular fine-tuning of large models, and they achieve scale and speed through ZeRO-based memory-management optimizations and flexible combinations of parallelism strategies in DeepSpeed training. Step 3 of the pipeline, however, is the most challenging part in terms of performance. Each iteration must efficiently handle two phases: a) the inference phase that generates responses, producing inputs for training, and b) the training phase that updates the weights of the actor and reward models, along with the interaction and scheduling between the two. This introduces two major difficulties: (1) the memory cost, since several copies of the SFT and RW models must be kept live throughout Step 3; and (2) the slow response-generation phase, which, if not properly accelerated, significantly slows down the whole of Step 3. In addition, the two important optional features we added in Step 3, Exponential Moving Average (EMA) collection and Mixture Training, incur additional memory and training costs.
+
+To tackle these challenges, we composed the full system capabilities of DeepSpeed Training and Inference into a unified infrastructure that we call the Hybrid Engine. It leverages the original DeepSpeed engine for the fast training mode, while effortlessly applying the DeepSpeed inference engine for the generation/evaluation mode, providing a significantly faster training system for Step 3 of RLHF training. As Figure 2 shows, the transition between the DeepSpeed training and inference engines is seamless: with the typical eval and train modes enabled for the actor model, DeepSpeed selects different optimizations when running the inference and training pipelines, to run the model faster and improve overall system throughput.
+
+
+
+
+
+*Figure 2. Design of the DeepSpeed Hybrid Engine, which accelerates the most time-consuming portion of the RLHF pipeline.*
+
+
+
+During the inference execution of the experience-generation phase of RLHF training, the DeepSpeed Hybrid Engine uses a lightweight memory-management system to handle the KV cache and intermediate results, together with highly optimized inference CUDA kernels and tensor-parallel computation. Compared with existing solutions, DeepSpeed-HE significantly improves throughput (tokens per second).
+
+During the training execution, the Hybrid Engine employs a range of memory-optimization techniques, such as DeepSpeed's ZeRO family of technologies and the now-popular LoRA method. These techniques are compatible with each other within the Hybrid Engine and can be composed to deliver the highest training efficiency.
+
+DeepSpeed-HE can seamlessly change model partitioning between training and inference, supporting tensor-parallelism-based inference and ZeRO-based sharding for training. It also reconfigures the memory system during these transitions to maximize memory availability. DeepSpeed-HE further improves performance by avoiding memory-allocation bottlenecks and by supporting large batch sizes. Integrating a spectrum of system technologies from DeepSpeed training and inference, the Hybrid Engine pushes past the limits of existing RLHF training and delivers unparalleled scale and system efficiency for RLHF workloads.
+
+# 5. DeepSpeed RLHF: Unparalleled Scale and Efficiency via the Hybrid Engine
+
+## Capability Recap
+
+As discussed above, DeepSpeed-HE is a powerful fusion of inference and training systems, designed to let DeepSpeed-RLHF achieve excellent scale and efficiency on a wide range of hardware, making RLHF training fast, affordable, and easily accessible to the AI community.
+
+In terms of efficiency and affordability, as Table 1 shows, DeepSpeed-HE can train an OPT-13B model in just 9 hours and an OPT-30B model in just 18 hours on Azure Cloud, for under $300 and $600 respectively. In terms of speed and scalability, as Table 2 shows, even a 13B model can be trained in 1.25 hours, and a massive 175B model can be trained in under a day using a 64-GPU cluster. In terms of accessibility and the democratization of RLHF, DeepSpeed-HE can train a model with over 13 billion parameters on a single GPU, as Table 3 shows.
+
+## Throughput and Model Size Scalability Comparisons with Existing RLHF Systems
+
+Compared with other RLHF systems, such as Colossal-AI or HuggingFace powered by native PyTorch, DeepSpeed-RLHF excels in system performance and model scalability:
+
+* In terms of throughput, DeepSpeed achieves over a 10x improvement for RLHF training on a single GPU (Figure 3). In multi-GPU setups, it is 6-19x faster than Colossal-AI and 1.4-10.5x faster than HuggingFace DDP (Figure 4).
+* In terms of model scalability, Colossal-AI can run a model of at most 1.3B on a single GPU and 6.7B on a single A100-40G node, whereas DeepSpeed-HE can run 6.5B and 50B models on the same hardware, an improvement of up to 7.5x.
+
+Therefore, with over an order of magnitude higher throughput than existing RLHF systems such as Colossal-AI or HuggingFace DDP, DeepSpeed-HE can train a larger actor model within the same time budget, or train a similarly sized model at a tenth of the cost.
+
+
+
+
+
+
+*Figure 3. Step 3 RLHF training throughput compared with two other system frameworks on a single NVIDIA A100-40G GPU. A missing icon indicates an out-of-memory (OOM) case.*
+
+
+
+
+
+
+
+*Figure 4. End-to-end training throughput comparison of Step 3 of the training pipeline (the most time-consuming portion) for different model sizes, on a single DGX node with 8 NVIDIA A100-40G GPUs. A missing icon indicates an out-of-memory (OOM) case.*
+
+
+
+This improvement in efficiency is the result of DeepSpeed-HE leveraging DeepSpeed's inference optimizations to accelerate RLHF generation during RLHF processing. Figure 5 shows the time breakdown of an RLHF training iteration for a 1.3B-parameter model: most of the time is spent in the generation phase. By leveraging DeepSpeed's high-performance inference kernels, DeepSpeed-HE achieves up to 9x higher throughput than HuggingFace and 15x higher than Colossal-AI in this phase, enabling unparalleled end-to-end efficiency.
+
+
+
+
+
+*Figure 5. Superior generation-phase acceleration of DeepSpeed Chat's Hybrid Engine: time/sequence breakdown for training an OPT-1.3B actor model + OPT-350M reward model on a single DGX node with 8 A100-40G GPUs.*
+
+
+
+## Effective Throughput and Scalability Analysis
+
+***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE in Stage 3 of RLHF training depends on the throughput it achieves in the generation and RL training phases. In our RLHF pipeline (see the [benchmarking setting](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) for details), the generation phase accounts for roughly 20% of the total compute, while the RL training phase accounts for the remaining 80%. Despite its small share, however, the former can take the majority of the end-to-end time, because it must run the actor model once for each generated token, making it memory-bandwidth bound and hard to drive at high throughput. The RL training phase, by contrast, is compute-intensive: it only needs a few forward and backward passes over the reference actor model, with the full 512 tokens from both the prompt and the generation for each sample, and can therefore achieve good throughput.
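+
+As a back-of-envelope illustration of why the memory-bound generation phase dominates wall-clock time (all numbers below are placeholders, not measurements):
+
+```python
+# Phase times add as work/throughput: generation's low achieved TFLOPs lets
+# 20% of the FLOPs consume most of the end-to-end time.
+gen_work, train_work = 0.2, 0.8         # fractions of total FLOPs
+gen_tflops, train_tflops = 20.0, 160.0  # illustrative per-GPU throughputs
+
+total_time = gen_work / gen_tflops + train_work / train_tflops
+effective_tflops = (gen_work + train_work) / total_time
+gen_share = (gen_work / gen_tflops) / total_time
+print(f"effective ~{effective_tflops:.0f} TFlops/GPU; generation ~{gen_share:.0%} of time")
+```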
+
+
+
+
+
+*Figure 6. RLHF generation, training, and effective throughput of DeepSpeed-HE for different model sizes, at the configuration that maximizes efficiency.*
+
+
+
+To maximize the effective throughput, DeepSpeed-HE optimizes both phases. First, it uses the largest batch size possible to achieve higher efficiency in both phases. Second, during the generation phase, it leverages high-performance CUDA kernels to maximize GPU memory-bandwidth utilization when the model fits on a single GPU, and falls back to tensor parallelism (TP) for the computation otherwise. Using TP rather than ZeRO in the generation phase further reduces inter-GPU communication and keeps GPU memory-bandwidth utilization high.
+
+Figure 6 shows the best achievable effective throughput of DeepSpeed-HE (in TFlops/GPU) for model sizes ranging from 1.3B to 175B, along with the throughput achieved in the generation and training phases separately. DeepSpeed-HE is most efficient for models in the 6.7B-66B range. Going beyond this range to 175B, the throughput drops because limited memory prevents larger batch sizes, but it is still 1.2x more efficient than for the small 1.3B model. The per-GPU throughput for these gigantic models could improve further when we scale them to more GPUs with more memory.
+
+Furthermore, we would like to point out that, as shown in Figure 4, our effective performance is 19x higher than that of existing systems, which suggests that they are operating at less than 5% of peak. This illustrates both the challenge of optimizing RLHF workloads and the effectiveness of our system in the face of that challenge.
+
+
+
+
+
+*Figure 7. Scalability of training a 13B (left) and 66B (right) actor model with a 350M reward model on an increasing number of DGX nodes (A100-40/80G GPUs).*
+
+
+
+***(II) Scalability Analysis.*** The best effective throughput for different model sizes is achieved with different numbers of GPUs, partly because some of the larger model sizes require more memory to run. With this in mind, we next discuss the scalability characteristics of DeepSpeed-HE.
+
+Figure 7 shows that DeepSpeed-RLHF achieves good overall scaling on clusters of up to 64 GPUs. A closer look, however, reveals that DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near-linear or sub-linear scaling at larger scale. This is due to the interplay between memory availability and the maximum global batch size.
+
+The core technology of DeepSpeed-HE is based on ZeRO, which partitions the model states across the GPUs during training. As the number of GPUs grows, per-GPU memory consumption drops, allowing DeepSpeed-HE to support a larger batch size per GPU and hence super-linear scaling. At larger scale, however, while the available memory keeps increasing, the capped maximum global batch size limits the per-GPU batch size, resulting in near-linear or sub-linear scaling. As a result, for a given maximum global batch size (e.g., we use 1024 sentences with a sequence length of 512), DeepSpeed-HE achieves its best throughput and cost efficiency at the boundary between super-linear and sub-linear scalability. The exact point is mostly determined by the largest batch size that can run per GPU, which in turn is a function of the available memory and the global batch size.
+
+# 6. Release: Try DeepSpeed Chat Today!
+
+We are very excited to announce that DeepSpeed-Chat is now open-sourced and available to the AI community.
+
+* If you find our work useful or like our open-source contributions, please star ⭐ our [DeepSpeed](https://github.com/microsoft/DeepSpeed) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) repositories.
+
+
+* To get started, please visit our DeepSpeed-Chat GitHub page: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)
+
+
+* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as those planned for the future.
+
+DeepSpeed-Chat is part of the larger DeepSpeed ecosystem, which comprises a multitude of deep learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* We publish our latest Chinese blog posts and updates on [Zhihu](https://www.zhihu.com/people/deepspeed). You can also follow our [English Twitter](https://twitter.com/MSFTDeepSpeed) and [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP) accounts.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, and others on deep learning research and on applying DeepSpeed to empower real-world AI models and applications. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
diff --git a/blogs/deepspeed-chat/ds-chat-release-8-31/README.md b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md
new file mode 100644
index 000000000000..e1d4bf952bf1
--- /dev/null
+++ b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md
@@ -0,0 +1,352 @@
+
+
+# DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements
+
+
+
+
+
+
+
+# Table of Contents
+1. [Introduction](#introduction)
+2. [System Support for Llama and Llama-2 models](#system-support-llama)
+3. [Improved Efficiency and Accessibility](#new-features)
+ - [3.3x Higher Throughput with MixZ++ for LoRA](#mixz)
+ - [ZeRO-Offload Support for Larger Models with 16x fewer GPUs](#zero-offload)
+4. [Stability Bug Fixes](#stability-bug-fixes)
+5. [Software Improvements](#software-improvements)
+ - [Characterization Scripts](#characterization-scripts)
+ - [Instrumentation](#instrumentation)
+ - [Testing](#testing)
+6. [Try Out DeepSpeed-Chat](#try-out-deepspeed-chat)
+
+
+# 1. Introduction
+
+DeepSpeed-Chat is a general system framework for RLHF training that enables easy, fast, affordable, and scalable training of ChatGPT-style models that we [publicly released on GitHub](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md). The detailed performance and capabilities of DeepSpeed-Chat have been published in our [blog post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) and [arXiv](https://arxiv.org/abs/2308.01320) paper.
+
+We are happy to share that today we are improving DeepSpeed-Chat in three areas: i) system support for the Llama/Llama-2 family of models, ii) system features for improved efficiency and accessibility, and iii) stability and software enhancements.
+
+- **System support for training Llama and Llama-2 models**
+
+ We ***introduce system support for training Llama and Llama-2 models*** in DeepSpeed-Chat enabling and leveraging various optimizations and features including the Hybrid Engine, ZeRO family of optimizations, Low-Rank Adaptation (LoRA) support, as well as full integration into the three-stage DeepSpeed-Chat RLHF pipeline. By leveraging the Hybrid-Engine, we speed up the experience generation phase for Llama-2-7B and Llama-2-13B models by **up to 7.1X**.
+
+- **New System Features for Improved Efficiency and Accessibility**
+  - ***Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954))***. It is an extended set of optimization strategies built upon [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed, similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by **up to 3.3x** for the Llama-2-70B model running on 128 V100 GPUs.
+
+ - ***[ZeRO-Offload](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/)***. It is an optimization that offloads optimizer memory and computation from the GPU to the host CPU, enabling larger models to be trained with fewer GPU resources. After training stability fixes and testing, we have enabled this feature across all three stages of the DeepSpeed-Chat RLHF training pipeline. ZeRO-Offload reduces the minimum number of GPUs required to train large models by **up to 16x**.
+
+- **Stability and Software Enhancements**
+
+ - DeepSpeed-Chat contains a rich set of features for training across many different platforms and scenarios. Composing these features in a systematic way and ensuring both system stability and decent training convergence is critical for the usability of the framework. Thus, in addition to new features in DeepSpeed-Chat, many system stability and training convergence issues have been fixed both in DeepSpeed-Chat (client code) and DeepSpeed (runtime). These improvements have been thoroughly tested using the OPT model family for end-to-end training. Furthermore, end-to-end testing, characterization scripts, and several instrumentation features like TensorBoard support are now also available. *To try out these latest features and software improvements, please use DeepSpeed release [v0.10.2](https://github.com/microsoft/DeepSpeed/tree/v0.10.2) and the latest DeepSpeed-Chat in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)*.
+
+ - Finally, to ensure the long-term health of the DeepSpeed-Chat training framework, [PyTests](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) were added for testing Step 3 of the RLHF training pipeline and are run on a nightly basis through a newly developed [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml).
+
+We now dive into the details of our new features, training stability, and software improvements.
+
+# 2. System Support for Llama and Llama-2 models
+
+The DeepSpeed-Chat training framework now provides system support for the Llama and Llama-2 models across all three stages of training. To support this, we encountered a spectrum of issues, spanning from minor runtime errors to intricate performance-related challenges. In particular, the Llama model architecture, which deviates from the standard Transformers block, was incompatible with DeepSpeed's inference kernels and the DeepSpeed container policy used by the Hybrid Engine. Addressing these hurdles necessitated extensive modifications across our DeepSpeed-Chat pipeline and the DeepSpeed runtime, including code to support the ZeRO family of optimizations and their interaction with optimized inference kernels in the Hybrid Engine. We have resolved these challenges to ensure that DeepSpeed-Chat can support Llama and Llama-2 and provide our users with the best possible experience. The details can be seen in the several PRs that have been merged in our codebases.
+
+## Key Supported Optimizations
+
+The following key optimizations in DeepSpeed are now fully integrated for Llama and Llama-2 models:
+
+- **DeepSpeed-Chat Integration**: Fully integrated into the complete, end-to-end three-stage DeepSpeed-Chat RLHF training framework, based on the OpenAI InstructGPT training strategy.
+- **Hybrid Engine**: DeepSpeed Hybrid Engine allows for superior generation phase [acceleration](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), now supported for all Llama-1 model variants, Llama-2-7B, and Llama-2-13B models.
+- **ZeRO and ZeRO-Offload**: Fully supported by the [ZeRO](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) family of optimizations, including offload support that leverages the full memory capacity of a system, thus enabling training of even larger models.
+- **Mixed Precision ZeRO++ (MixZ++)**: Enhanced support for larger models like Llama-2-70B through the new MixZ++ feature, improving efficiency and reducing memory usage when there are frozen or non-trainable parameters.
+- **LoRA**: Fully supported by the [LoRA](https://github.com/microsoft/LoRA) feature, which vastly reduces the storage requirements for large language models by freezing original weights and learning pairs of rank-decomposition matrices.
+
+## Getting Started
+
+Users looking to try the new Llama and Llama-2 model support can get started by using the newly added Llama scripts.
+| Step Number | Scripts |
+| --- | --- |
+| 1 | [Llama-2 Step 1 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2) |
+| 2 | [Llama-2 Step 2 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2) |
+| 3 | [Llama-2 Step 3 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2) |
+
+*Note*: While all the system aspects of Llama and Llama-2 support have been extensively tested, there are no guarantees about training convergence, and hyper-parameter tuning may be required to achieve convergence.
+
+## Performance Evaluation
+
+We highlight the performance benefits of the Hybrid Engine for Llama-2 models on NVIDIA A100 and V100 GPUs in this section. Improved performance for larger models like Llama-2-70B and reduced resource requirements via ZeRO-Offload are discussed in the [next section](#new-features).
+
+#### A100 Performance Evaluation
+Using A100 GPUs, we achieve 7.1x faster generation for Llama-2-7B and 5.4x faster generation for Llama-2-13B with DeepSpeed-Chat Hybrid Engine compared to DeepSpeed-Chat without Hybrid Engine (baseline) as shown in *Figure 1*.
+
+
+
+
+
+ *Figure 1: Up to 7.1x faster Llama-2 generation with DS-Chat Hybrid Engine*
+
+
+
+#### V100 Performance Evaluation
+Using V100 GPUs, we achieve 4x faster generation for Llama-2-7B and 2.1x faster generation for Llama-2-13B with DeepSpeed-Chat Hybrid Engine compared to DeepSpeed-Chat without Hybrid Engine (baseline) as shown in *Figure 2*.
+
+
+
+
+
+  *Figure 2: [Left] 4x faster Llama-2-7B generation with DS-Chat Hybrid Engine (16 V100 GPUs) [Right] 2.1x faster Llama-2-13B generation with DS-Chat Hybrid Engine on 32 V100 GPUs vs. DS-Chat without Hybrid Engine on 16 V100 GPUs.*
+
+
+
+
+# 3. Improved Efficiency and Accessibility
+
+We now dive into the details of the two new features we are introducing today: 1) Mixed Precision ZeRO++ (MixZ++) and 2) ZeRO-Offload. Both of these features offer unique benefits for DeepSpeed-Chat users: MixZ++ provides up to 3.3x higher throughput for LoRA-enabled training, and ZeRO-Offload reduces the minimum number of GPUs required for training by up to 16x.
+
+## 3.3x Higher Throughput with MixZ++ for LoRA
+
+Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954)) is an extended set of optimization strategies built upon [ZeRO](https://www.deepspeed.ai/tutorials/zero/) and [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA.
+
+Similar to [ZeRO](https://www.deepspeed.ai/tutorials/zero/), MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them only when needed. In addition, similar to ZeRO++, MixZ++ allows for hierarchical partitioning and quantized communication. The hierarchical partitioning allows all the parameters to be stored within a node when possible so that the communication happens within a node, where communication bandwidth is significantly higher than communicating across nodes. The communication overhead is further reduced by quantizing the weights before gathering them.
+
+Finally, unlike ZeRO++, where parameters are always stored in fp16/bf16 and quantized/dequantized before and after communication, MixZ++ can persistently store the frozen weights of [Low-Rank Adaptation (LoRA)](https://github.com/microsoft/LoRA) training in lower precision, significantly reducing the communication overhead, eliminating the quantization overhead, and supporting larger batch sizes that enable better efficiency.
+
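+These strategies are switched on through ZeRO config flags; below is a minimal sketch following the public MixZ++ tutorial (flag availability depends on your DeepSpeed version, and the surrounding values are illustrative):
+
+```python
+# Illustrative ZeRO-3 configuration with ZeRO++/MixZ++ style options.
+ds_config = {
+    "zero_optimization": {
+        "stage": 3,
+        "zero_quantized_weights": True,               # quantized weight gather
+        "zero_hpz_partition_size": 8,                 # hierarchical partitioning within a node
+        "zero_quantized_nontrainable_weights": True,  # keep frozen (LoRA base) weights in low precision
+    },
+}
+```
+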
+A comprehensive exploration of technical details can be accessed through our [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/), [MixZ++ tutorial](https://www.deepspeed.ai/tutorials/mixed_precision_zeropp/), and [paper](https://arxiv.org/pdf/2306.10209.pdf).
+
+#### Highlights
+
+State-of-the-art approaches like [QLoRA](https://arxiv.org/abs/2305.14314) focus on combining multiple techniques, such as quantization of LoRA weights, new datatypes like NF4, and memory-management/offload techniques like paged optimizers, to enable finetuning of large models on a single GPU. MixZ++ is our quantization-powered approach to large-model training, designed to scale to a large number of GPUs with simplicity and compatibility with existing technologies like ZeRO-Offload and the DeepSpeed Hybrid Engine.
+
+MixZ++ has the following highlights:
+- Simplicity: A general solution requiring no assumptions about the model and/or optimizer. Integrating it into your training script is as simple as adding a single line of code.
+- Performance: Powered by a set of highly optimized CUDA kernels that enables efficient quantization/dequantization. The evaluation shows up to 3.3x higher throughput for Llama-2-70B training on 128 GPUs compared to the ZeRO-3 baseline (*Figure 3*).
+- Compatibility: Compatible with DeepSpeed/ZeRO features like DeepSpeed Hybrid Engine, ZeRO-Offload, etc.
+- Scalability: Designed to scale to a large number of GPUs. It is tested on up to 384 GPUs on Azure.
+
+
+#### Performance Evaluation
+To assess the effectiveness of MixZ++ for LoRA-enabled training, we carried out a series of RLHF training experiments (Step 3) using the Llama-2-70B model. These experiments were conducted on hardware configurations featuring 64 and 128 V100 GPUs. A visual representation of the experiment results is shown in the following figure:
+
+
+
+
+  *Figure 3: We achieve 3.3x increased throughput for RLHF training of Llama-2-70B on 128 V100 GPUs using Mixed Precision ZeRO++ vs. ZeRO-3. We observed 2x improved throughput for the same experiment on 64 V100 GPUs.*
+
+
+
+Specifically, our results showcase a 2x increase in training throughput when utilizing 64 GPUs with MixZ++, compared to the ZeRO-3 baseline. Furthermore, when scaling up to 128 GPUs, the speedup effect becomes even more pronounced, with a substantial 3.3x improvement in training throughput. These outcomes underscore the potential of MixZ++ as a powerful tool for improving training efficiency in large-scale GPU settings.
+
+To try this feature, please refer to [MixZ++ tutorial](https://www.deepspeed.ai/tutorials/mixed_precision_zeropp/).
+
+## ZeRO-Offload Support for Larger Models with 16x fewer GPUs
+
+[ZeRO-Offload](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) powers unprecedented model sizes by leveraging the full memory capacity of a system, concurrently exploiting all heterogeneous memory. Modern GPU clusters have 2-3x more CPU memory than GPU memory. ZeRO-Offload capitalizes on this disparity and offloads optimizer memory and computation from the GPU to the host CPU, enabling larger models to be trained with fewer GPU resources without being bottlenecked by the CPU's lower bandwidth. ZeRO-Offload allows training of large models on up to 16x fewer GPUs as we can see in *Figure 4*.
+
+
+
+
+ *Figure 4: ZeRO-Offload enables us to train Llama-2-7B with 16x fewer GPUs. 16 V100 GPUs are required for training Llama-2-7B with DS-Chat ZeRO-3. Enabling LoRA allows for the number of GPUs to be reduced to 4 while enabling ZeRO-Offload reduces the number of needed GPUs to 1. The HuggingFace Baseline does not run due to memory limitations.*
+
+
+
+ZeRO-Offload was [disabled](https://github.com/microsoft/DeepSpeedExamples/pull/553) with the initial release of DeepSpeed-Chat due to training instability observed when it was used with Hybrid Engine and LoRA. After improvements to Hybrid Engine and LoRA, as well as extensive testing of all feature configurations for ZeRO Stage 2 and ZeRO Stage 3, this feature can now be enabled across all three steps of the DeepSpeed-Chat training framework. Please note that configuring ZeRO-Offload with ZeRO Stage 2 and Hybrid Engine while LoRA is disabled is currently unsupported due to observed training instability.
+
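+For reference, ZeRO-Offload is enabled through the `offload_optimizer` block of a DeepSpeed config; below is a minimal sketch (the surrounding values are illustrative, not DeepSpeed-Chat's exact settings):
+
+```python
+# Minimal DeepSpeed config sketch with optimizer states offloaded to CPU.
+ds_config = {
+    "train_batch_size": 32,
+    "fp16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu",     # optimizer states and updates live on the host
+            "pin_memory": True,  # pinned host memory for faster transfers
+        },
+    },
+}
+```
+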
+
+
+
+ *Figure 5: Reward scores for all supported DeepSpeed-Chat configurations with ZeRO-Offload enabled. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
+
+
+
+# 4. Stability Bug Fixes
+
+A wide range of issues have been addressed in the DeepSpeed runtime and the DeepSpeed-Chat pipeline. These fixes enable advanced features such as Hybrid Engine, LoRA, and ZeRO-Offload to run across all training steps of the DeepSpeed-Chat pipeline and improve training stability and convergence.
+
+
+
+
+ *Figure 6: Step 3 Reward Scores for all supported DeepSpeed-Chat configurations. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
+
+
+
+*Figure 6* above shows the training convergence across all supported DeepSpeed-Chat configurations. This data was collected using 16 V100 NVIDIA GPUs, the [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) OPT model as the actor, the [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) OPT model as the critic, and the following DeepSpeed and DeepSpeedExamples repository commits: DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).
+
+We now dive into the details of all the fixes across different areas.
+
+## DeepSpeed-Chat Pipeline Fixes
+
+In this section we discuss the functionality and training stability fixes in the DeepSpeed-Chat pipeline.
+
+- **Training Stability:**
+
+ - [PR #620 - Make training more stable](https://github.com/microsoft/DeepSpeedExamples/pull/620)
+
+    - To improve the training stability in Step 3, several different areas of training were tuned and changed. To start, the Kullback-Leibler (KL) divergence used in the Proximal Policy Optimization (PPO) trainer was slightly tuned to reduce the divergence between the new and reference policies and improve the reward score. Next, the sequence generation function in the PPO trainer (`_generate_sequence()`) no longer specifies a `min_length` in the Actor model's `generate()` call. This means generated sequences are not artificially lengthened, allowing sequence-generation collapse to surface, e.g., when training convergence is extremely poor. A minor off-by-one error was also fixed in the PPO trainer's reward computation function (`compute_rewards()`). Finally, the PPO trainer's RLHF training function was updated to zero out the reward and value after the end of a conversation, to prevent incorrect `advantages` and `returns`.
+
+ - [PR #633 - DS Chat Step 3 - Add separate Lora Adam optimizer group](https://github.com/microsoft/DeepSpeedExamples/pull/633)
+
+    - The [LoRA](https://github.com/microsoft/LoRA) feature is supported across all three training steps of the DeepSpeed-Chat framework. Prior to this stability effort, there was no distinction between the overall learning rate and the LoRA learning rate, i.e., the LoRA learning rate was set to whatever the overall learning rate was. This led to instability in training convergence, as can be seen in *Figure 7* below, which shows the reward score across training steps for various Step 3 configurations:
+
+
+
+
+ *Figure 7: Before the fix, the sweep across all ZeRO-2 cases without a separate LoRA learning rate shows training instability when LoRA is used.*
+
+
+
+    To address this training convergence issue, when creating the optimizer grouped parameters, the LoRA `lora_right_weight` and `lora_left_weight` parameters were explicitly separated out and given their own LoRA-specific learning rate (a sketch of this appears after this list). After this change, a dramatic improvement in stability was observed, as shown in the figure below:
+
+
+
+
+ *Figure 8: After creating a separate LoRA learning rate, the sweep across all ZeRO-2 cases shows proper convergence.*
+
+
+
+ The next fix details the addition of separate LoRA learning rate arguments.
+
+  - [PR #685 - Add LoRA LR for DS Chat steps 1-3](https://github.com/microsoft/DeepSpeedExamples/pull/685)
+
+ - A *separate* LoRA learning rate argument can now be provided in each of the three training steps, with Step 3 having individual LoRA learning rates for the Actor and Critic models.
+
+- **Bug Fixes:**
+
+ - [PR #636 - DS Chat Step 3 - Fix Zero Stage 3](https://github.com/microsoft/DeepSpeedExamples/pull/636)
+
+    - During DeepSpeed-Chat Step 3 training, we observed hangs when ZeRO Stage 3 was enabled for the actor model and the `world_size > 1`. When observing the state of each rank, one rank would still be in the sequence generation phase `self._generate_sequence()`, while the other rank had already progressed to the `self.actor_model()` call. This ZeRO Stage 3 desynchronization, due to misaligned token generation between the GPUs, can normally be automatically detected and accounted for in the HuggingFace Transformers library via `synced_gpus`. However, due to the nature of the DeepSpeed-Chat pipeline and the lifetime of the corresponding model configuration objects, this automatic detection code was not triggered. To resolve this, when invoking the `generate()` function, the `synced_gpus` argument is explicitly passed and set to `True` when ZeRO Stage 3 is being used (see the second sketch after this list).
+
+ - [PR #658 - Fix only optimize lora and ack-ckpting compatible](https://github.com/microsoft/DeepSpeedExamples/pull/658)
+
+ - This fix allows Step 3 training to run with the combination of gradient checkpointing and *LoRA-only* parameter optimization, a previously unsupported training case. With the addition of the [enable_input_require_grads](https://github.com/huggingface/transformers/blob/f26099e7b5cf579f99a42bab6ddd371bf2c8d548/src/transformers/modeling_utils.py#L1225) model utility function in the HuggingFace Transformers library, which enables the gradients for the input embeddings, gradient checkpointing and optimization of *only* the LoRA parameters is made possible.
+
+ - [PR #576 - Fix argparse](https://github.com/microsoft/DeepSpeedExamples/pull/576)
+
+ - An external contributor helped in resolving an argument parsing issue.
+
+ - [PR #584 - Fix unused parameter bug](https://github.com/microsoft/DeepSpeedExamples/pull/584)
+
+ - An external contributor fixed the passing of an uninitialized parameter that was hardcoded earlier.
+
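+Below is a minimal sketch of the separate-optimizer-group idea from the stability fixes above; it follows the `lora_right_weight`/`lora_left_weight` naming of DeepSpeed-Chat's LoRA module but is not the exact repository code:
+
+```python
+import torch
+
+def get_grouped_parameters(model, lr, lora_lr, weight_decay=0.0):
+    # Split trainable parameters so LoRA matrices get their own learning rate.
+    lora_params, base_params = [], []
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue
+        if "lora_right_weight" in name or "lora_left_weight" in name:
+            lora_params.append(param)
+        else:
+            base_params.append(param)
+    return [
+        {"params": base_params, "lr": lr, "weight_decay": weight_decay},
+        {"params": lora_params, "lr": lora_lr, "weight_decay": weight_decay},
+    ]
+
+# e.g., optimizer = torch.optim.AdamW(get_grouped_parameters(model, 9.65e-6, 5e-4))
+```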
+
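+And a sketch of the `synced_gpus` fix from PR #636; `zero_stage` stands in for the pipeline's configuration, and the generation arguments are illustrative:
+
+```python
+def generate_sequence(model, tokenizer, prompts, zero_stage, max_new_tokens=256):
+    # Under ZeRO Stage 3 every rank must take the same number of generate()
+    # steps, so synced_gpus keeps the ranks in lockstep and prevents hangs.
+    batch = tokenizer(prompts, return_tensors="pt", padding=True)
+    return model.generate(
+        **batch,
+        max_new_tokens=max_new_tokens,
+        synced_gpus=(zero_stage == 3),
+    )
+```
+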
+## Hybrid Engine Fixes
+In this section we discuss several fixes in the Hybrid Engine.
+
+- [PR #3563 - Fix LoRA Fuse/Unfuse in Hybrid Engine](https://github.com/microsoft/DeepSpeed/pull/3563)
+
+ - During Step 3 training for OPT with LoRA and Hybrid Engine enabled, an issue arose regarding a tensor size mismatch of the LoRA weights. Specifically, the LoRA QKV weights were not fused in the OPT container policy, yet they were expected to be fused by the Hybrid Engine. This challenge was effectively resolved by introducing both fused and unfused LoRA methods in the Hybrid Engine. We thank @sxjscience for providing this fix.
+
+- [PR #3883 - Extend HE-Lora test with Z3 support + Fix/add guard in HE for Z3](https://github.com/microsoft/DeepSpeed/pull/3883)
+
+ - The Hybrid Engine was updated to properly check whether ZeRO Stage 3 was enabled when resetting the inference container parameters, along with expanding the corresponding unit tests.
+
+
+## ZeRO Stage 3 Fixes
+In this section we discuss several fixes in support of the ZeRO Stage 3 feature.
+
+- [PR #3819 - Fix racing condition in GatheredParameters](https://github.com/microsoft/DeepSpeed/pull/3819)
+
+    - A race condition in the ZeRO `GatheredParameters` context, which resulted in various `'status': 'INFLIGHT'` issues, was fixed by removing duplicate input parameters that were being passed from the Hybrid Engine.
+
+- [PR #3884 - Separate ZeRO3 InflightParamRegistry for train and eval](https://github.com/microsoft/DeepSpeed/pull/3884)
+
+ - The ZeRO Stage 3 `InflightParamRegistry` was updated to use a separate `InflightParamRegistry` for training and evaluation, fixing an issue where leftover parameters in flight were causing inflight parameter errors. These fixes, along with related fixes in the Hybrid Engine, enabled the use of the ZeRO-Offload feature in the DeepSpeed-Chat training pipeline.
+
+- [PR #3928 - Remove the param.ds_tensor from print](https://github.com/microsoft/DeepSpeed/pull/3928)
+
+ - A minor change that was necessary to address the DeepSpeed-Chat Step 3 hang issue ([PR #636](https://github.com/microsoft/DeepSpeedExamples/pull/636)) as it allowed us to progress further into execution and observe the desynchronization point.
+
+
+# 5. Software Improvements
+
+To improve the characterization, ease of debug, and maintainability of the DeepSpeed-Chat framework, several areas of software improvements have been completed. Characterization scripts were added to enable systematic composition of features, instrumentation was added to improve insight into the behavior of training, and a testing CI workflow was added to improve the maintainability of the DeepSpeed-Chat training framework.
+
+## Characterization Scripts
+
+The DeepSpeed-Chat training framework provides a rich set of features (Hybrid Engine, ZeRO, LoRA, etc.) that can be composed in many different combinations, depending on the scenario. The interactions between the features are often complex and composing them in a systematic way for characterization is useful for understanding their behavior. To support such use cases, characterization scripts have been added to run sweeps of Steps 1, 2, and 3 training for various combinations of features. The scripts default to OPT but can be modified to run with Llama. Please see the READMEs in the following folders for more details:
+
+- [Step 1 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep)
+- [Step 2 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep)
+- [Step 3 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep)
+
+For example, the Step 3 characterization script sweeps across various training features:
+| Feature | Values |
+| --- | --- |
+| ZeRO Stage | 2, 3 |
+| Hybrid Engine | True, False |
+| ZeRO-Offload | True, False |
+| LoRA | True, False |
+
+The sweep can be run as follows:
+
+```
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning$ bash training_scripts/opt/single_node/sweep/run_step3_sweep.sh
+```
+
+The training log for each combination of features is stored in a folder whose name is formatted as `z${z}_he_${he}_offload_${offload}_lora_${lora}`.
+
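+For clarity, the 16-case sweep space and the resulting log-folder names can be enumerated as follows (a sketch, not the script itself):
+
+```python
+from itertools import product
+
+for z, he, offload, lora in product((2, 3), (True, False), (True, False), (True, False)):
+    print(f"z{z}_he_{he}_offload_{offload}_lora_{lora}")
+```
+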
+
+Related PRs:
+
+- [DS Chat Characterization Scripts (Step 1 and 3)](https://github.com/microsoft/DeepSpeedExamples/pull/638)
+- [Add step 2 sweep script, clean up scripts](https://github.com/microsoft/DeepSpeedExamples/pull/664)
+- [Update script location and docs for all 3 steps](https://github.com/microsoft/DeepSpeedExamples/pull/681)
+
+## Instrumentation
+
+To gain better insight into DeepSpeed-Chat training, new [instrumentation features](https://github.com/microsoft/DeepSpeedExamples/pull/624) were added across all three steps of DeepSpeed-Chat and can be enabled via arguments to each step's `main.py`.
+
+| Argument | Description | Step(s) |
+| --- | --- | --- |
+| --print_loss | Print loss during each step | 1 |
+| --enable_tensorboard | Enable TensorBoard logging at the model Runtime Engine level | 1,2,3 |
+| | Enable TensorBoard logging at the Training Pipeline level | 3 |
+| --tensorboard_path | Path to write TensorBoard log | 1,2,3 |
+| --print_answers | Print actor model prompt and answers during training across all ranks | 3 |
+
+
+### TensorBoard
+TensorBoard logging can be enabled in each of the three training steps, with some slight nuances in Step 3. To start, for each training step, the `enable_tensorboard` argument can be used to enable a TensorBoard monitor at the Runtime Engine level ([see documentation](https://www.deepspeed.ai/docs/config-json/#monitoring-module-tensorboard-wandb-csv)) and is reflected in the corresponding model training configuration:
+```python
+"tensorboard": {
+ "enabled": enable_tensorboard,
+ "output_path": f"{tb_path}/ds_tensorboard_logs/",
+ "job_name": f"{tb_name}_tensorboard"
+}
+```
+
+- **Step 3**:
+  Due to Step 3 initializing both an Actor and a Critic model, _each_ of the models has its own corresponding TensorBoard monitor at the Runtime Engine level. Beyond that, Step 3 training also contains a Pipeline-level TensorBoard monitor one level above the model runtime engines, which captures the `reward`, `actor_loss`, `actor_loss_sum`, `critic_loss`, and `critic_loss_sum`.
+
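+A minimal sketch of such a pipeline-level monitor, using PyTorch's standard `SummaryWriter` (not DeepSpeed-Chat's exact code; the metric values are placeholders):
+
+```python
+from torch.utils.tensorboard import SummaryWriter
+
+writer = SummaryWriter(log_dir="step3_tensorboard_logs")
+metrics = [(0.8, 1.2, 0.5), (1.1, 1.0, 0.4)]  # (reward, actor_loss, critic_loss) per step
+for step, (reward, actor_loss, critic_loss) in enumerate(metrics):
+    writer.add_scalar("reward", reward, global_step=step)
+    writer.add_scalar("actor_loss", actor_loss, global_step=step)
+    writer.add_scalar("critic_loss", critic_loss, global_step=step)
+writer.close()
+```
+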
+## Testing
+
+As part of the DeepSpeed team's commitment to maintaining the DeepSpeed-Chat training framework, continuous integration [PyTest](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) testing has been added for Step 3 RLHF training in a new [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml).
+
+| Description | Status |
+| ----------- | ------ |
+| Integrations | [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) |
+
+The workflow is run on a **nightly** basis across a **16-case** test matrix (see table below), and uses the **facebook/opt-125m** model for both the actor and critic.
+
+| Parameter | Values |
+| --- | --- |
+| ZeRO Stage | 2, 3 |
+| Hybrid Engine | True, False |
+| ZeRO-Offload | True, False |
+| LoRA | True, False |
+
+Each configuration (16 total) runs through a limited number of Step 3 non-overflow training steps (i.e. steps where neither actor nor critic overflow) and saves the actor/critic models. Assertions are used to check if the training pipeline executed correctly and if the actor and critic models were saved properly.
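+
+The checks take roughly the following shape; this is a hypothetical sketch, not the contents of `test_training.py`, and the launcher script and output paths are stand-ins:
+
+```python
+import os
+import subprocess
+
+def test_step3_saves_models(tmp_path):
+    # Hypothetical launcher: run a few non-overflow Step 3 steps, then save.
+    result = subprocess.run(["bash", "run_step3.sh", str(tmp_path)],
+                            capture_output=True, text=True)
+    assert result.returncode == 0, result.stderr
+    assert os.path.exists(tmp_path / "actor" / "pytorch_model.bin")
+    assert os.path.exists(tmp_path / "critic" / "pytorch_model.bin")
+```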
+
+# 6. Try Out DeepSpeed-Chat
+We are very excited to share this DeepSpeed-Chat feature and stability release.
+
+* To get started, please visit our GitHub page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)
+
+* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for future.
+
+DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem, comprising a multitude of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
+
+* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work!
diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md
new file mode 100644
index 000000000000..e7aa62721417
--- /dev/null
+++ b/blogs/deepspeed-chat/japanese/README.md
@@ -0,0 +1,345 @@
+
+
+# DeepSpeed Chat: Easy, Fast, and Affordable Training of ChatGPT-like Models at All Scales
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed Chat, please cite our [arxiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+ title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+ author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+ journal={arXiv preprint arXiv:2308.01320},
+ year={2023}
+}
+```
+
+# 1. Overview
+
+ChatGPT and similar models have taken the AI world by storm and are having a revolutionary impact on the digital industry. These models are extremely versatile and can perform diverse tasks such as summarization, coding, and translation with results on par with, or better than, those of human experts. Spurred by their overwhelming capabilities, multiple efforts are underway in the AI open-source community to make ChatGPT-style models more accessible (e.g., ChatLLaMa, Alpaca, Vicuna, Databricks-Dolly).
+
+Despite the great effort invested across these projects, however, there has been no end-to-end pipeline that can run the RLHF (Reinforcement Learning from Human Feedback) training required for ChatGPT-like models easily enough and with high efficiency. For example, training a 6.7B ChatGPT-like model requires many expensive GPUs, putting it out of reach for many data scientists. Even with access to such compute resources, conventional software could extract less than 5% of the hardware's performance (see the [overview](#effective-throughput-and-scalability)). And there has been no way to train state-of-the-art ChatGPT-like models with hundreds of billions of parameters easily, quickly, and affordably with existing software.
+
+RLHF, as proposed in InstructGPT and used for training ChatGPT, is completely different from conventional pretraining and fine-tuning and requires a far more complex pipeline, which conventional software had no mechanism to support effectively. To make RLHF training available to the broad AI community and enable everyone to create ChatGPT-like models, we are releasing DeepSpeed-Chat with the following capabilities:
+
+(i) ***Easy-to-run training and inference for ChatGPT-like models***: We provide scripts that start from a pretrained model available in the Hugging Face repository, run all three steps of InstructGPT-style training, and produce your own ChatGPT-like model. We also provide an inference API for testing conversational interaction after training.
+
+(ii) ***DeepSpeed-RLHF pipeline***: The DeepSpeed-RLHF pipeline comprehensively reproduces, with one-to-one correspondence, the three steps of the InstructGPT training pipeline: a) supervised fine-tuning (SFT), b) reward model fine-tuning, and c) RLHF (Reinforcement Learning with Human Feedback). It also provides data abstraction and blending capabilities to enable training with multiple data sources at once.
+
+(iii) ***DeepSpeed-RLHF system***: We provide DeepSpeed-HE, a hybrid engine for RLHF that unifies DeepSpeed's training and inference capabilities. DeepSpeed-HE can switch seamlessly between inference and training modes within the RLHF pipeline, leveraging various DeepSpeed-Inference optimizations for inference, such as tensor parallelism and high-performance Transformer kernels, while using the many memory-optimization techniques of ZeRO and LoRA for RL training. DeepSpeed-HE is also designed to fit the RLHF pipeline exactly, applying the optimal techniques for memory management and data movement in each phase of RLHF.
+
+The DeepSpeed-RLHF system achieves unmatched efficiency for large-scale model training, letting the AI community run complex RLHF training quickly, affordably, and easily:
+
+***Efficiency and cost***: In terms of efficiency, [DeepSpeed-HE is more than 15x faster than existing systems](#effective-throughput-and-scalability), making RLHF training fast and affordable. For example, DeepSpeed-HE can train an OPT-13B model in just 9 hours and OPT-30B in 18 hours on Azure Cloud, for under $300 and $600 respectively.
+
+
+
+| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B |
+| ------- | :----------------------------------------------------------: | :------------------------------: | :-----: | :-----: |
+| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA |
+| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) |
+
+*Table 1. Training time on a single node (8x A100) and approximate cost on Azure.*
+
+
+
+
+***Excellent scalability***: DeepSpeed-HE supports models with hundreds of billions of parameters and achieves excellent scalability on systems with multiple nodes and multiple GPUs. As a result, a 13B model can be trained in 1.25 hours, and even a huge 175B model can be trained within a day.
+
+
+
+| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-like-175B |
+| ------------ | :-------------------------------: | :---------------------------------: | :-----: | :-----------: |
+| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120) |
+
+*Table 2. Training time on multiple nodes (64x A100-80GB) and approximate cost on Azure.*
+
+
+> ***Note***: The numbers in the two tables above are for Stage 3 of training. They are based on measured throughput for one epoch of training over a total of 135M tokens, with the dataset and training configuration used by DeepSpeed-RLHF: 67.5M query tokens (130K queries with sequence length 256) and 67.5M generated tokens (130K answers with sequence length 256), with a maximum global batch size of 0.5M tokens (1024 queries and answers each) per step. Please review these details carefully before comparing cost and runtime against DeepSpeed-RLHF. See the [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) for more details.
+
+***Democratizing RLHF***: DeepSpeed-HE can train a model with more than 13 billion parameters on a single GPU. Data scientists without access to expensive multi-GPU setups can create not just small toy models but large, powerful RLHF models usable in real-world scenarios.
+
+
+
+| | V100 32G | A6000 48G | A100 40G | A100 80G |
+| ---------- | :------: | :-------: | :------: | :------: |
+| Model Size | OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B |
+
+*Table 3. Model sizes trainable on a single GPU with DeepSpeed-HE.*
+
+
+
+The sections below take a deeper look at the three capabilities of DeepSpeed-Chat introduced above.
+
+# 2. Easy-to-use ChatGPT Training and Inference Experience
+
+We start by showing how easy training with DeepSpeed-RLHF is, with examples that train OPT-13B and OPT-66B models. Even if you have only a short time for training, you can train an OPT-1.3B model in just two hours on a single consumer-grade GPU. We also show how to use the DeepSpeed-Chat APIs to build a customized pipeline.
+
+## Training ChatGPT-style models with a DeepSpeed-Chat example
+
+*a) Run a single script to execute all three stages of RLHF training and watch it produce your own ChatGPT-like model!*
+
+Here we show an example that uses RLHF to train a pretrained OPT-13B as the actor model and OPT-350M as the reward model.
+Running the following script produces a final ChatGPT-like model with 13 billion parameters:
+
+```python
+pip install deepspeed>=0.9.0
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-Chat/
+pip install -r requirements.txt
+
+python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
+```
+
+On a DGX server with 8 A100 GPUs, the 13-billion-parameter model can be trained in half a day. The following is a breakdown of the time each step takes.
+
+
+
+| Model Size | Step 1 | Step 2 | Step 3 | Total |
+| -------------------------------- | :----: | :----: | :----: | :----: |
+| Actor: OPT-13B, Reward: OPT-350M | 2.5 hours | 0.25 hours | 10.8 hours | 13.6 hours |
+
+*Table 4. End-to-end time breakdown on a single DGX server with 8 A100 GPUs.*
+
+
+
+*b) Try your trained model with DeepSpeed Chat's inference API!*
+
+Plug in the trained model and give it a try. You can enjoy not only question answering but also conversational interaction.
+
+```
+Human: Do you know Microsoft?
+Assistant: Yes, I’m a big fan of Microsoft. What about them?
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
+Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which
+ is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the
+ programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
+ use the computer.
+```
+
+## Changing the model size and configuration
+
+You may need to try different model sizes and configurations to meet your requirements for training time, compute resources, and quality. With DeepSpeed-Chat, that is easy: for example, if you want to train a larger, higher-quality model on a GPU cluster for research or business, simply specify the desired model size and number of GPUs. The following example sets the actor model to 66B parameters and the number of GPUs to 64.
+
+```python
+python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
+```
+
+With 64 A100 (80GB memory) GPUs, a 66-billion-parameter ChatGPT model can be trained in 9 hours.
+
+
+
+| Model Size | Step 1 | Step 2 | Step 3 | Total |
+| -------------------------------- | :-----: | :----: | :----: | :---: |
+| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5 hours | 9 hours |
+
+*Table 5. End-to-end time breakdown for training a 66-billion-parameter model on 8 DGX servers, each with 8 A100 (80GB memory) GPUs.*
+
+
+
+You could also train a small toy model with DeepSpeed-Chat during a 1-2 hour coffee or lunch break. For example, we prepared an example that trains a 1.3B model on a single dataset so you can try the framework on a consumer-grade GPU. When you come back from your lunch break, the checkpoint of your trained model will be ready to play with.
+
+```python
+python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
+```
+
+
+
+| Model Size | Step 1 | Step 2 | Step 3 | Total |
+| --------------------------------- | :-------: | :------: | :----: | :---: |
+| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2 hours | 2.2 hours |
+
+*Table 6. End-to-end time breakdown on a single commodity-grade GPU (A6000).*
+
+
+
+## Customizing the RLHF pipeline with DeepSpeed-Chat's APIs
+
+DeepSpeed-Chat provides flexible APIs for configuring training, so users can build their own RLHF training pipelines. Through these APIs, users have access to a generic interface and backend for creating a wide range of RLHF algorithms for exploratory research.
+
+```python
+
+engine = DeepSpeedRLHFEngine(
+ actor_model_name_or_path=args.actor_model_name_or_path,
+ critic_model_name_or_path=args.critic_model_name_or_path,
+ tokenizer=tokenizer,
+ num_total_iters=num_total_iters,
+ args=args)
+
+trainer = DeepSpeedPPOTrainer(engine=engine, args=args)
+
+for prompt_batch in prompt_train_dataloader:
+ out = trainer.generate_experience(prompt_batch)
+ actor_loss, critic_loss = trainer.train_rlhf(out)
+```
+
+# 3. The Complete RLHF Training Pipeline
+
+For a seamless training experience, DeepSpeed-Chat incorporates a complete end-to-end training pipeline similar to that of InstructGPT (Figure 1).
+
+
+
+
+
+*Figure 1: Overview of DeepSpeed-Chat's training pipeline.*
+
+
+
+The pipeline is divided into the following three steps:
+
+* Step 1: Supervised fine-tuning (SFT): fine-tune a pretrained language model on carefully selected human responses to a variety of queries.
+* Step 2: Reward model fine-tuning: train a separate model (called the reward model, usually smaller than the SFT model) on a human-provided dataset that ranks multiple answers to the same query.
+* Step 3: RLHF training: further fine-tune the SFT model with feedback from the reward model, using the Proximal Policy Optimization (PPO) algorithm.
+
+In Step 3, two additional features can be used to further improve model quality:
+
+* Exponential Moving Average (EMA) collection: an EMA-based model checkpoint can be used for the final evaluation.
+* Mixture training: the pretraining objective (next-word prediction) is mixed with the PPO objective to prevent degradation of model quality on public benchmarks such as SQuAD2.0.
+
+These two features are often omitted in recent open-source projects. However, according to InstructGPT, EMA checkpoints generally achieve better response quality than the conventional final trained model, and mixture training preserves the benchmark-solving abilities acquired during pretraining. DeepSpeed-Chat provides both features so that training equivalent to what is described in InstructGPT can be carried out.
+
+In addition to the features that mirror InstructGPT, we also provide the following convenient capability to help researchers and developers train their own RLHF models using multiple data resources:
+
+* Data abstraction and blending capabilities: Models can be trained on multiple datasets to improve model quality. DeepSpeed-Chat is therefore equipped with (1) an abstract dataset layer that unifies the format of different datasets, and (2) data splitting/blending capabilities that properly blend multiple datasets and split them across the three training stages.
+
+
+# 4. DeepSpeed Hybrid Engine: the Infrastructure Powering RLHF Training
+
+Steps 1 and 2 of the RLHF pipeline, which learn from given instructions, resemble regular fine-tuning of large models. DeepSpeed-Chat therefore achieves high scalability and fast training for them through optimizations based on DeepSpeed's ZeRO technologies and flexible combinations of DeepSpeed's parallelization strategies. Step 3, by contrast, is the most complex part in terms of its impact on performance. Each training iteration consists of two phases: a) an inference phase for token/experience generation, producing the inputs for training, and b) a training phase that updates the parameters of the actor and reward models; the interaction and scheduling between the two must also be handled efficiently. Realizing this poses two challenges: (1) optimizing memory usage so that multiple copies of the SFT and reward models remain available throughout Stage 3, and (2) accelerating the generation phase, which heavily influences the speed of the entire Stage 3. When Exponential Moving Average (EMA) collection and mixture training are used, the required memory and processing time grow further.
+
+To tackle these challenges, we composed the full system capabilities of DeepSpeed training and inference into a unified infrastructure, the Hybrid Engine DeepSpeed-HE. It leverages the original DeepSpeed engine in training mode and applies the DeepSpeed inference engine in generation/inference mode, dramatically accelerating Stage 3 of RLHF training. As Figure 2 shows, switching between the DeepSpeed training and inference engines is seamless: with inference and training modes enabled for the actor model, DeepSpeed selects different optimizations for each when running the inference and training pipelines, improving the throughput of the overall system.
+
+
+
+
+
+
+*Figure 2. The Hybrid Engine (DeepSpeed-HE), which accelerates the most time-consuming part of RLHF.*
+
+
+
+For inference during the experience-generation phase of RLHF training, DeepSpeed-HE substantially improves throughput (tokens/second) over existing software, through a lightweight memory-management system that handles the KV cache and intermediate results, highly optimized inference kernels, and tensor parallelism.
+
+For training, memory-optimization techniques such as DeepSpeed's family of ZeRO technologies and Low-Rank Adaptation (LoRA) are available. DeepSpeed-HE is implemented so that these optimization techniques can be combined with one another to achieve very high training efficiency.
+
+DeepSpeed-HE seamlessly changes the model partitioning between training and inference, supporting inference based on tensor parallelism and training based on the sharding mechanism of DeepSpeed's ZeRO technologies. To make maximum use of memory, it also reconfigures memory allocation for each of these modes. This avoids memory-allocation bottlenecks and improves performance by supporting large batch sizes. The Hybrid Engine, which aggregates DeepSpeed's diverse system technologies for training and inference, goes beyond the limits of today's RLHF training and makes RLHF executable at unmatched scale and efficiency.
+
+
+# 5. DeepSpeed RLHF: Unmatched Scale and Efficiency via the Hybrid Engine DeepSpeed-HE
+
+## Capability Recap
+
+As described above, DeepSpeed-HE fuses powerful technologies for inference and training. It is designed to deliver excellent scalability and high efficiency for the DeepSpeed-RLHF pipeline across a wide range of hardware, making RLHF training fast, affordable, and easily accessible to the AI community.
+
+Table 1 shows the efficiency and cost for different model sizes and GPUs. With DeepSpeed-HE, OPT-13B can be trained in just 9 hours and OPT-30B in 18 hours on Azure Cloud, at a cost of less than $300 and $600 respectively. In terms of speed and scalability, as shown in Table 2, even a 13B-parameter model can be trained in 1.25 hours, and a huge 175B model can be trained within a day on a 64-GPU cluster. And from the perspective of making RLHF accessible to everyone, DeepSpeed-HE can train a model with more than 13 billion parameters on a single GPU, as shown in Table 3.
+
+
+## Throughput and model size scalability: comparison with existing RLHF systems
+
+Compared with other systems capable of RLHF training, such as Colossal-AI and Hugging Face with native PyTorch, DeepSpeed-RLHF is superior in both speed and scalability:
+
+* In terms of throughput, DeepSpeed achieves more than a 10x improvement for RLHF training on a single GPU (Figure 3). In multi-GPU settings, it achieves a 6-19x speedup over Colossal-AI and 1.4-10.5x over Hugging Face DDP (Figure 4).
+* In terms of model scalability, Colossal-AI can train a model of at most 1.3B on a single GPU and 6.7B on a single node with A100-40G GPUs, whereas DeepSpeed-HE can train 6.5B and 50B models on the same hardware, handling models up to 7.5x larger.
+
+DeepSpeed-HE therefore achieves more than an order of magnitude higher throughput than existing RLHF systems such as Colossal-AI and Hugging Face DDP, and can train much larger actor models in the same wall-clock time, or train similarly sized models at more than 10x lower cost.
+
+
+
+
+
+*Figure 3. Step 3 throughput comparison with other frameworks, using a single A100-40G GPU. Crosses indicate runs that fail with out-of-memory.*
+
+
+
+
+
+
+
+*Figure 4. End-to-end training throughput comparison for Step 3 (which accounts for most of the three-step total time), using a single DGX node with 8 A100-40G GPUs. Crosses indicate runs that fail with out-of-memory.*
+
+
+
+This efficiency stems from DeepSpeed-HE accelerating the generation phase of RLHF processing with DeepSpeed's highly optimized inference capabilities. Figure 5 shows the time breakdown of RLHF training for a 1.3B-parameter model: most of the time is spent in the generation phase. By leveraging DeepSpeed's high-performance inference kernels, DeepSpeed-HE achieves up to 9x the throughput of Hugging Face and 15x that of Colossal-AI in this phase, realizing unmatched end-to-end efficiency.
+
+
+
+
+
+*Figure 5. Generation-phase acceleration with DeepSpeed-HE (OPT-1.3B-based actor model + OPT-350M-based reward model, on a single DGX node with 8 A100-40G GPUs).*
+
+
+
+## Effective Throughput and Scalability
+
+***(I) Effective throughput analysis.*** The effective throughput of DeepSpeed-HE in Stage 3 of RLHF is determined by the throughput of both the generation phase and the RL training phase. In our RLHF pipeline, the generation phase accounts for about 20% of the total computation and the RL training phase for the remaining 80% (see the [benchmark page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) for details). Despite its small share of the computation, however, the former can occupy most of the end-to-end time: for each of the 256 generated tokens, it must run actor-model inference once over the initial prompt of 256 tokens, so it is limited by memory bandwidth and has difficulty achieving high throughput. In contrast, the RL training phase can be executed with only a few forward and backward passes over the reference actor model, using the full 512 tokens from both the prompt and the generation per sample, and can therefore achieve high throughput.
+
+
+
+
+
+*Figure 6. Generation, training, and effective throughput of RLHF with DeepSpeed-HE (GPU counts chosen for best efficiency).*
+
+
+
+To maximize the effective throughput, DeepSpeed-HE optimizes both the generation phase and the RL training phase. First, it uses the largest batch size possible to obtain higher efficiency in both phases. Second, in the generation phase, it leverages high-performance Transformer kernels to maximize GPU memory-bandwidth utilization when the model fits in the memory of a single GPU, and additionally uses tensor parallelism when it does not. Using tensor parallelism in the generation phase, instead of ZeRO-based memory savings, reduces inter-GPU communication and keeps GPU memory-bandwidth utilization high.
+
+Figure 6 shows the best effective throughput achievable with DeepSpeed-HE, in TFlops per GPU, for model sizes from 1.3B to 175B, together with the throughput achieved in the generation and training phases. DeepSpeed-HE is most efficient for models in the 6.7B-66B range. Beyond this range, at 175B, memory limitations prevent setting a large batch size and the throughput drops, although it is still 1.2x more efficient than for the 1.3B model. The per-GPU throughput when training such huge models may improve further on GPUs with more memory that can handle larger batch sizes.
+
+Moreover, as shown in Figure 4, our effective performance is 19x that of existing systems, which suggests they are operating at less than 5% of peak performance. This illustrates the difficulty of optimizing RLHF workloads, as well as the effectiveness of our system for the RLHF pipeline.
+
+
+
+
+
+*Figure 7. Scalability when increasing the number of DGX nodes (8 A100-40/80G GPUs per node), using 13B and 66B actor models with a 350M reward model.*
+
+
+
+***(II) Scalability analysis.*** The number of GPUs that yields the best throughput differs by model size. This is partly because larger models need more memory to run, and also because of the scalability properties of DeepSpeed-HE explained below.
+
+Figure 7 shows that DeepSpeed-RLHF achieves good overall scalability at up to 64 GPUs. A closer look, however, reveals that DeepSpeed-RLHF training achieves super-linear scaling at small scale and linear or sub-linear scaling at larger scale. This is due to the interplay between memory availability and the maximum global batch size.
+
+Because DeepSpeed-HE employs ZeRO technologies for training, it can partition the model across the available GPUs. Per-GPU memory consumption therefore decreases as the number of GPUs increases, which lets DeepSpeed-HE support larger per-GPU batch sizes and produces super-linear scaling. At larger scale, however, while the available memory continues to increase, the bounded maximum global batch size forces the per-GPU batch size to shrink, resulting in linear or sub-linear scaling. As a result, for a given maximum global batch size, DeepSpeed-HE achieves its best throughput and cost efficiency at the boundary between super-linear and sub-linear scalability, and the exact point is determined mostly by the largest batch size that can run on each GPU, as a function of the available memory and the global batch size.
+
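+This interplay can be seen with placeholder numbers (illustrative only, not measurements):
+
+```python
+# With ZeRO sharding, per-GPU model-state memory falls as GPUs are added,
+# freeing room for larger per-GPU batches, until the fixed global batch
+# (e.g., 1024 sequences) caps the per-GPU batch size instead.
+model_states_gb = 192.0  # placeholder size of all model states
+global_batch = 1024
+for gpus in (8, 16, 32, 64):
+    per_gpu_states = model_states_gb / gpus   # ZeRO partitioning
+    per_gpu_batch_cap = global_batch // gpus  # global-batch ceiling
+    print(f"{gpus} GPUs: {per_gpu_states:.1f} GB states/GPU, batch cap {per_gpu_batch_cap}")
+```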
+
+# 6. DeepSpeed-Chat Release: Try It Out Now!
+
+We are delighted to release DeepSpeed-Chat to the AI community as open-source software.
+
+* Visit the DeepSpeed-Chat [GitHub page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) and get started right away.
+* We plan to keep improving DeepSpeed-Chat with your feedback and help. See the [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) for the features that are currently supported and those planned for the future.
+
+
+# 7. About DeepSpeed
+
+DeepSpeed is software that provides a range of capabilities for making extremely large-scale, fast deep learning easy to realize.
+DeepSpeed-Chat is part of the broader DeepSpeed software ecosystem.
+DeepSpeed enables:
+
+* Training and inference of dense or sparse models with billions to trillions of parameters
+* High throughput and scalability to thousands of GPUs
+* Training and inference in environments with limited GPU resources
+* Unprecedented levels of low-latency, high-throughput inference
+* Low-latency inference and model-size reduction through advanced model compression
+
+DeepSpeed is part of Microsoft's [AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/), which is driving next-generation AI capabilities at scale; see [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) for details. DeepSpeed has been used to train a variety of large models, including [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf), [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed), [GLM (130B)](https://github.com/THUDM/GLM-130B), and [YaLM (100B)](https://github.com/yandex/YaLM-100B).
+
+DeepSpeed also serves as the backend of many prominent open-source deep learning frameworks, including [Hugging Face Transformers](https://huggingface.co/docs/transformers/main/main_classes/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), and [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html).
+
+For more information about DeepSpeed:
+
+* The [DeepSpeed website](https://www.deepspeed.ai/) hosts detailed blog posts, tutorials, and documentation about DeepSpeed's technologies.
+* The [DeepSpeed Twitter account (English)](https://twitter.com/MSFTDeepSpeed) shares the latest DeepSpeed news; please follow us. We also have a [Japanese Twitter account](https://twitter.com/MSFTDeepSpeedJP) that delivers the latest information in Japanese.
+
+The DeepSpeed team welcomes feedback and inquiries from users:
+
+* Bug reports, pull requests, and participation in discussions are welcome on [GitHub](https://github.com/microsoft/DeepSpeed/). See the [guidelines](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for details.
+* The DeepSpeed team collaborates with universities, research institutes, and companies on deep learning research with DeepSpeed and on applying it to real-world AI models and applications (researchers who can communicate in Japanese are also on the team). For such collaboration requests (and other topics not suitable for GitHub), please email deepspeed-info@microsoft.com directly.
diff --git a/blogs/deepspeed-fastgen/README.md b/blogs/deepspeed-fastgen/README.md
new file mode 100644
index 000000000000..c309a9def53f
--- /dev/null
+++ b/blogs/deepspeed-fastgen/README.md
@@ -0,0 +1,302 @@
+
+
+# DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference
+
+
+
+
+
+
+
+
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Key LLM Serving Techniques](#background)
+3. [Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy](#technical-approach)
+4. [Performance Evaluation](#performance-evaluation)
+5. [DeepSpeed-FastGen: Implementation and Usage](#using-deepspeed-fastgen)
+6. [Try out DeepSpeed-FastGen](#try)
+7. [Acknowledgements](#acknowledgements)
+
+
+## 1. Introduction
+
+Large language models (LLMs) like GPT-4 and LLaMA have emerged as a dominant workload in serving a wide range of applications infused with AI at every level. From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. While frameworks like DeepSpeed, PyTorch, and several others can regularly achieve good hardware utilization during LLM training, the interactive nature of these applications and the poor arithmetic intensity of tasks like open-ended text generation have become the bottleneck for inference throughput in existing systems.
+
+To this end, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf) powered by PagedAttention and research systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved the performance of inference for LLMs. However, these systems still struggle to provide consistent quality of service, particularly for workloads with longer prompts. These long prompt workloads are becoming increasingly important as more and more models, like [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b), and systems, such as [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows stretching to tens of thousands of tokens. To better understand the problem space, we describe in detail how text generation for LLMs works in two distinct phases called prompt processing and generation. When systems treat these as strictly distinct phases, generation is preempted by prompt processing, which risks breaking the service level agreements (SLAs).
+
+Today, we are glad to present DeepSpeed-FastGen, a system that overcomes these limitations by leveraging the proposed Dynamic SplitFuse technique and offers up to 2.3x higher effective throughput compared to state-of-the-art systems like vLLM. DeepSpeed-FastGen leverages the combination of DeepSpeed-MII and DeepSpeed-Inference to provide an easy-to-use serving system.
+
+**Quick Start:** Trying DeepSpeed-FastGen is as simple as installing the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) release:
+
+```bash
+pip install deepspeed-mii
+```
+
+To generate text using a simple non-persistent pipeline deployment, run the following code. For more details, please see [Section 5](#using-deepspeed-fastgen).
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+## 2. Existing LLM Serving Techniques in Literature
+
+A text generation workload for a single sequence consists of two phases: 1) prompt processing, in which the user-provided text is efficiently processed as a batch of tokens to build a key-value (KV) cache for attention, and 2) token generation, in which a single token is appended to that cache and a new token is produced. Over the course of generating a sequence of text, the model makes many forward calls to generate the full sequence. Two major techniques have been proposed in the literature and deployed in systems to address the various limitations and bottlenecks that may arise during these phases.
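+To make the two phases concrete, the following sketch shows the shape of such a generation loop over a KV cache; the function and variable names are illustrative, not part of any real API:
+
+```python
+# A minimal sketch of the two phases (illustrative names, not a real API).
+def generate(model, prompt_tokens, max_new_tokens):
+    # Phase 1: prompt processing -- a single forward pass over the full prompt
+    # builds the key-value (KV) cache and yields the first new token.
+    kv_cache, next_token = model.forward(prompt_tokens, kv_cache=None)
+    generated = [next_token]
+    # Phase 2: token generation -- each forward pass consumes one token,
+    # appends its keys/values to the cache, and produces the next token.
+    for _ in range(max_new_tokens - 1):
+        kv_cache, next_token = model.forward([next_token], kv_cache=kv_cache)
+        generated.append(next_token)
+    return generated
+```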
+
+_Blocked KV Caching:_
+
+vLLM identified that memory fragmentation due to large monolithic KV caches significantly reduced the concurrency of LLM serving systems and proposed [Paged Attention](https://arxiv.org/pdf/2309.06180.pdf) to enable non-contiguous caches and increase total system throughput. Rather than assigning individual variable-sized contiguous chunks of memory, the underlying storage in the KV cache consists of fixed-sized blocks (also known as pages). The blocked KV cache increases system throughput by eliminating KV-cache-induced memory fragmentation and thereby increasing potential sequence concurrency. Non-contiguous KV cache implementations are also included in [HuggingFace TGI](https://github.com/huggingface/text-generation-inference) and [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
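+As a rough picture of how such block-based allocation works, consider the following simplified sketch (hypothetical, not vLLM's or DeepSpeed-FastGen's actual allocator):
+
+```python
+# Simplified sketch of fixed-size KV-cache block ("page") allocation.
+BLOCK_SIZE = 16  # tokens per block (illustrative value)
+
+class BlockAllocator:
+    def __init__(self, num_blocks):
+        self.free_blocks = list(range(num_blocks))
+
+    def allocate(self, num_tokens):
+        """Return block ids covering num_tokens; blocks need not be contiguous."""
+        needed = -(-num_tokens // BLOCK_SIZE)  # ceiling division
+        if needed > len(self.free_blocks):
+            raise MemoryError("KV cache exhausted")
+        return [self.free_blocks.pop() for _ in range(needed)]
+
+    def free(self, block_ids):
+        # Freed blocks are immediately reusable by any sequence, so there is
+        # no fragmentation from variable-sized contiguous allocations.
+        self.free_blocks.extend(block_ids)
+```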
+
+_Continuous Batching:_
+
+In the past, dynamic batching, in which a server would wait for multiple requests to process in phase with each other, was used to improve GPU utilization. However, this approach has drawbacks, as it typically requires padding inputs to identical lengths or stalling the system to wait to construct a larger batch.
+
+Recent advancements in large language model (LLM) inference and serving have focused on fine-grained scheduling and memory-efficiency optimizations. For instance, Orca proposes _iteration-level scheduling_ (also known as continuous batching), which makes distinct scheduling decisions at each forward pass of the model. This allows requests to join and leave the batch as needed, eliminating the need for padding requests and thus improving overall throughput. In addition to Orca, continuous batching has been implemented in NVIDIA TRT-LLM, HuggingFace TGI, and vLLM.
+
+In current systems, there are two primary approaches to implementing continuous batching. In TGI and vLLM, the generation phase is preempted to perform prompt processing (called infill in TGI) before continuing with generation. In Orca, these phases are not distinguished; instead, Orca adds a prompt to the running batch so long as the total number of sequences doesn't reach a fixed bound. Both approaches, to varying degrees, need to stall generation to process long prompts (see [Section 3B](#splitfuse)).
+
+To address these shortcomings, we propose a novel prompt and generation composition strategy, Dynamic SplitFuse.
+
+## 3. Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy
+
+DeepSpeed-FastGen is built to leverage continuous batching and non-contiguous KV caches to enable increased occupancy and higher responsiveness for serving LLMs in the data center, similar to existing frameworks such as TRT-LLM, TGI, and vLLM. In order to achieve a new level of performance, DeepSpeed-FastGen introduces SplitFuse, which leverages dynamic prompt and generation decomposition and unification to further improve continuous batching and system throughput.
+
+### A. Three Performance Insights
+Before describing Dynamic SplitFuse, we answer three key performance questions that together motivate its design.
+
+*__1. What factors impact the forward pass of a single LLM?__* In order to schedule effectively, it is necessary to understand which independent variables the scheduling loop should control. We observe below that the composition of sequences in a forward pass (the batch size in sequences) has a negligible impact on performance compared to the raw number of tokens in the forward pass. This means an effective scheduler can be built around a single signal: the number of tokens in the forward pass.
+
+
+
+
+
+*__2. How does a model's throughput respond to changing the number of tokens in the forward pass?__* An LLM has two key operating regions with a relatively steep transition between them. With a small number of tokens, the GPU bottleneck is reading the model from memory, so throughput scales with the number of tokens; with many tokens, the model is compute bound and throughput is near constant. The model runs most efficiently if all forward passes stay in the throughput-saturating region.
+
+
+
+
+
+*__3. How should a pool of tokens be scheduled across multiple forward passes?__* We observe above that for well-aligned inputs the token-throughput curve is concave, which means its second derivative is less than or equal to 0. As an example, let $f(x)$ be the (concave) throughput of a forward pass with $x$ tokens for a given model. For a concave function $f(x)$, the following holds:
+
+ $$0 \geq \lim_{h \to 0} \frac{f(x + h) - 2f(x) + f(x - h)}{h^2}$$
+
+ $$0 \geq f(x + h) - 2f(x) + f(x - h)$$
+
+ $$2f(x) \geq f(x + h) + f(x - h)$$
+
+This states that for a given pool of `2x` tokens to process, throughput is maximized by splitting them evenly between two batches. More generally, in a system that must consume and process P tokens over F forward passes, the ideal partitioning scheme divides them equally.
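+This can be checked numerically; the snippet below assumes an arbitrary concave throughput curve rather than measured data:
+
+```python
+import numpy as np
+
+# An assumed concave token-throughput curve f (illustrative, not measured).
+f = lambda x: 1.0 - np.exp(-x / 512.0)
+
+total = 2048  # a pool of 2x tokens to spend across two forward passes
+for a in range(256, total, 256):
+    b = total - a
+    print(f"{a:5d} + {b:5d} tokens -> combined throughput {f(a) + f(b):.4f}")
+# The combined value peaks at the even split a == b == 1024,
+# matching 2f(x) >= f(x + h) + f(x - h).
+```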
+
+### B. Dynamic SplitFuse
+
+Dynamic SplitFuse is a novel token composition strategy for prompt processing and token generation. DeepSpeed-FastGen utilizes Dynamic SplitFuse to run at a consistent forward size by taking partial tokens from prompts and composing them with generation. In particular, Dynamic SplitFuse performs two key behaviors:
+
+1. Long prompts are decomposed into much smaller chunks and scheduled across multiple forward passes (iterations) with only the final pass performing any generation.
+2. Short prompts will be composed to exactly fill a target token budget. Even short prompts may be decomposed to ensure the budget is precisely met and the forward sizes are well-aligned.
+
+Together, these two techniques provide concrete benefits on all user metrics:
+
+1. *__Better Responsiveness__:* Since long prompts no longer require extremely long forward passes to process, the model will provide lower client latency. More forward passes are performed within the same window of time.
+2. *__Higher Efficiency:__* Fusion of short prompts to larger token budgets enables the model to consistently operate in the high throughput regime.
+3. *__Lower variance and better consistency:__* Since forward passes are of consistent size, and forward pass size is the primary determinant of performance, the latency of each forward pass is much more consistent than in competing systems, as is the perceived generation frequency. There is no preemption and there are no long-running prompt passes to inflate latency, as in prior work.
+
+Consequently, DeepSpeed-FastGen consumes tokens from incoming prompts at a rate that keeps ongoing generation fast while adding tokens that raise system utilization, providing lower-latency and higher-throughput streaming generation to all clients as compared to other state-of-the-art serving systems.
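+The composition policy can be sketched as follows; this is a simplified illustration of the token-budget idea, not the actual DeepSpeed-FastGen scheduler:
+
+```python
+# Simplified illustration of Dynamic SplitFuse scheduling (not the real code).
+TOKEN_BUDGET = 2048  # target tokens per forward pass (illustrative value)
+
+def compose_batch(num_generating_seqs, pending_prompt_lengths):
+    """Return (generation_tokens, prompt_chunks) for one forward pass."""
+    budget = TOKEN_BUDGET - num_generating_seqs  # 1 token per in-flight sequence
+    chunks = []
+    # Fill the remaining budget with (possibly partial) prompt chunks, so long
+    # prompts are split across passes and short ones are fused together.
+    for length in pending_prompt_lengths:
+        if budget <= 0:
+            break
+        chunk = min(budget, length)
+        chunks.append(chunk)
+        budget -= chunk
+    return num_generating_seqs, chunks  # every pass is consistently sized
+
+print(compose_batch(1500, [4000, 100, 100]))  # -> (1500, [548])
+```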
+
+
+
+
+
+ *Figure 1: Illustration of continuous batching strategies. Each block shows the execution of a forward pass. An arrow indicates that the forward pass has sequences with one or more tokens generated. vLLM performs either token generations or prompt processing in a forward pass; token generation preempts prompt processing. Orca runs prompts at their complete length alongside generation. Dynamic SplitFuse performs dynamic composition of fixed-sized batches composed of both generation and prompt tokens.*
+
+
+
+## 4. Performance Evaluation
+
+DeepSpeed-FastGen provides state-of-the-art LLM serving performance leveraging its blocked KV cache and Dynamic SplitFuse continuous batching. We evaluate DeepSpeed-FastGen against vLLM on a range of models and hardware configurations following the benchmarking methodology discussed below.
+
+### A. Benchmarking Methodology
+
+We use two primary quantitative schemes for measuring performance.
+
+**Throughput-Latency Curves:** Two key metrics for production readiness are throughput (measured in requests per second) and latency (the responsiveness of each request). To measure this, we instantiate multiple clients (ranging from 1 to 32) concurrently and send requests (512 in total) to the server. The resulting latency of each request is measured at the endpoint and throughput is measured by the end-to-end time to complete the experiment.
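+As a concrete sketch of this measurement loop (an assumed structure using the client API shown in Section 5, not our released benchmark scripts):
+
+```python
+import time
+from concurrent.futures import ThreadPoolExecutor
+import mii
+
+client = mii.client("mistralai/Mistral-7B-v0.1")
+
+def timed_request(prompt):
+    start = time.time()
+    client.generate(prompt, max_new_tokens=60)
+    return time.time() - start  # per-request latency at the endpoint
+
+prompts = ["..."] * 512  # 512 requests in total
+start = time.time()
+with ThreadPoolExecutor(max_workers=16) as pool:  # 1 to 32 concurrent clients
+    latencies = list(pool.map(timed_request, prompts))
+throughput = len(prompts) / (time.time() - start)  # end-to-end requests/sec
+```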
+
+**Effective Throughput:** Interactive applications, such as chat applications, can have more stringent and complex requirements than can be captured by top-level metrics like end-to-end latency. In particular, we focus on the increasingly popular chat user scenario:
+
+ 1. A user initiates a task by sending a prompt.
+ 2. The system processes the prompt and returns the first token.
+ 3. Subsequent tokens are streamed to the user as they are produced.
+
+At each point in this process there is an opportunity for a system to provide an adverse user experience; for example, if the first token arrives too slowly or the generation appears to stop for some time. We propose an SLA framework that considers both of these dimensions.
+
+As the lengths of prompts and generated texts vary significantly, affecting computational costs, it is impractical to set rigid SLA values for throughput and latency. Therefore, we define the SLA for prompt latency as |tokens in prompt| / 512 seconds (i.e., 512 tokens/s). Additionally, considering typical human reading speed, we set the SLA for generation latency, computed as an exponential moving average (EMA), to 2, 4, or 6 tokens/sec. Requests that adhere to these SLAs are deemed successful, and the throughput of these successful requests is referred to as **effective throughput**.
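+The bookkeeping behind effective throughput can be sketched as follows (assumed field names and EMA details; the real measurement code may differ):
+
+```python
+def meets_sla(prompt_tokens, first_token_latency, token_gaps, ema_target=4.0):
+    # Prompt SLA: first token within |tokens in prompt| / 512 seconds.
+    if first_token_latency > prompt_tokens / 512.0:
+        return False
+    # Generation SLA: exponential moving average of tokens/sec >= target.
+    ema, alpha = None, 0.1
+    for gap in token_gaps:  # seconds between consecutive streamed tokens
+        rate = 1.0 / gap
+        ema = rate if ema is None else alpha * rate + (1 - alpha) * ema
+    return ema is not None and ema >= ema_target
+
+def effective_throughput(requests, total_time):
+    # Throughput counted over successful (SLA-meeting) requests only.
+    return sum(meets_sla(*r) for r in requests) / total_time
+```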
+
+We evaluate vLLM and DeepSpeed-FastGen on Llama-2 7B, Llama-2 13B, and Llama-2 70B on NVIDIA A100, H100, and A6000 GPUs.
+
+### B. Throughput-Latency Analysis
+
+In this experiment, DeepSpeed-FastGen outperforms vLLM in both throughput and latency, providing either greater throughput at equivalent latency or lower latency at the same throughput. On Llama-2 70B with 4 A100-80GB GPUs, DeepSpeed-FastGen demonstrates up to 2x higher throughput (1.36 rps vs. 0.67 rps) at identical latency (9 seconds), or up to 50% latency reduction (7 seconds vs. 14 seconds) while achieving the same throughput (1.2 rps), as shown in Figure 2. These trends hold when evaluating Llama-2 13B, as shown in Figure 3.
+
+
+
+
+ *Figure 2: Throughput and latency of text generation using Llama 2 70B (Tensor parallelism across 4 A100-80GB GPUs). A normal distribution was applied to prompt and generation lengths with averages of 1200/2600 and 128/60, respectively, and a 30% variance*
+
+
+
+
+
+ *Figure 3: Throughput and latency of text generation using Llama 2 13B (A100-80GB GPU, no tensor parallelism). A normal distribution was applied to prompt and generation lengths with averages of 1200/2600 and 60/128, respectively, and a 30% variance*
+
+
+### C. Effective Throughput Analysis
+
+Under the effective throughput analysis that considers both first token latency and the rate at which generation occurs, DeepSpeed-FastGen provides up to 2.3x higher throughput than vLLM. Figure 4 presents a comparative analysis of the effective throughputs of DeepSpeed-FastGen and vLLM. Each plotted point denotes the effective throughput derived from a specific number of clients. As we scaled the number of clients, we initially observed an increase in effective throughput. However, the latency also significantly increases as the number of clients approaches the system's capacity, causing many requests to fail in meeting the SLA. Consequently, the effective throughput will either saturate or decrease at some point. From a usability perspective, it's not particularly relevant how many clients are required to achieve the max effective throughput; the maximum point of the line is the optimal serving point.
+
+
+
+
+ *Figure 4: Effective throughput of DeepSpeed-FastGen and vLLM (Llama 2 70B/A100-80GB using tensor parallelism across 4 A100-80GB GPUs. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance)*
+
+
+When vLLM preempts the ongoing generation of previous requests, the generation latency experiences a notable increase. This leads to vLLM's effective throughput appearing lower than its directly measured throughput. At vLLM's peak, the effective throughput was 0.63 queries/sec and around 28% of requests failed to meet the 4 tokens/s SLA. At the same SLA, DeepSpeed-FastGen achieved 1.42 queries/sec (less than 1% of requests failed to meet the SLA), which is 2.3x higher than vLLM.
+
+### D. Token Level Timing Analysis
+
+Figure 5 displays the P50, P90, and P95 latencies of the generation process. Both vLLM and DeepSpeed-FastGen exhibit similar P50 latencies, but vLLM demonstrates significantly higher latencies at P90 and P95.
+At P95, DeepSpeed-FastGen achieves a 3.7x latency reduction.
+
+This discrepancy is due to a noticeable spike in vLLM's generation latency when it preempts the ongoing generation to process new prompts.
+In contrast, DeepSpeed-FastGen typically processes the prompt and generation for previous requests concurrently, leading to much more consistent generation latency.
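+Given the per-token latency samples collected during a run, these percentiles are straightforward to compute, e.g. (with made-up example data):
+
+```python
+import numpy as np
+
+# Gaps (in seconds) between consecutive streamed tokens, pooled over requests.
+token_latencies = np.array([0.030, 0.032, 0.031, 0.250, 0.033])  # example data
+p50, p90, p95 = np.percentile(token_latencies, [50, 90, 95])
+print(p50, p90, p95)
+```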
+
+
+
+
+
+ *Figure 5: Per-Token generation Latency of Llama 2 70B/A100-80GB using tensor parallelism across 4 A100-80GB GPUs, 16 clients. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 128, respectively, and a 30% variance.*
+
+
+
+### E. Scalability using Load Balancing
+
+DeepSpeed-FastGen offers replica-level load balancing that evenly distributes requests across multiple servers, allowing you to effortlessly scale up your application.
+
+Figure 6 illustrates the scalability of DeepSpeed-FastGen when employing the load balancer with up to 16 replicas. Note that we utilized 4 A100 GPUs per Llama 2 70B replica; in total, we employed 8 nodes to run the 16 replicas. The results demonstrate nearly perfect scalability with DeepSpeed-FastGen:
+given a single-replica throughput of 1.46 queries/sec, the throughput with 16 replicas reaches 23.7 queries/sec, a linear 16x increase over a single replica.
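+To reproduce a load-balanced deployment like this one, DeepSpeed-MII exposes replica and tensor-parallel options at serve time; the sketch below assumes the MII parameter names at the time of writing and a hypothetical Llama 2 70B checkpoint id:
+
+```python
+import mii
+
+# Each replica shards the model across 4 GPUs; 16 replicas span 8 nodes.
+mii.serve(
+    "meta-llama/Llama-2-70b-hf",  # assumed checkpoint id
+    tensor_parallel=4,
+    replica_num=16,
+)
+```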
+
+
+
+
+ *Figure 6: Scalability using the load balancing feature. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+### F. Other Hardware Platforms
+
+In addition to the in-depth analysis on A100, we provide additional benchmarking results for H100 and A6000. The same performance trends observed on A100 hold on both H100 and A6000.
+
+
+
+
+ *Figure 7: Throughput-latency curve and effective throughput of Llama 2 70b using 8 H100 GPUs. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+
+
+
+ *Figure 8: Throughput-latency curve and effective throughput of Llama 2 7b using A6000. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+## 5. DeepSpeed-FastGen: Implementation and Usage
+
+DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed) as illustrated in the figure below. Together, both of these software packages provide various components of the system including the frontend APIs, the host and device infrastructure to schedule batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations.
+
+
+
+
+
+
+
+The fastest way to get started with our alpha release of DeepSpeed-FastGen is: `pip install deepspeed-mii`.
+
+Please follow our [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and reporting issues, please use the [DeepSpeed-MII Github repository](https://github.com/microsoft/DeepSpeed-MII).
+
+### A. Supported Models
+
+We currently support the following model architectures in this alpha release of DeepSpeed-FastGen:
+
+* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
+* [Mistral](https://huggingface.co/models?other=mistral)
+* [OPT](https://huggingface.co/models?other=opt)
+
+All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
+
+We plan to add additional models in the coming weeks and months after the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know.
+
+### B. Deployment options
+All of the examples below are runnable in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). Once installed, you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment:
+
+#### Non-persistent pipeline
+
+The non-persistent pipeline deployment is a great and fast way to get started and requires only a few lines of code. Non-persistent models exist only for the duration of the Python script you are running and are useful for temporary interactive sessions.
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+#### Persistent deployment
+
+A persistent deployment is ideal for use with long-running and production applications. The persistent deployment uses a lightweight GRPC server that can be created using the following 2 lines:
+
+
+```python
+import mii
+mii.serve("mistralai/Mistral-7B-v0.1")
+```
+
+The above server can be queried by multiple clients at once thanks to the built-in load balancer from DeepSpeed-MII. Creating a client also just takes 2 lines of code:
+
+```python
+client = mii.client("mistralai/Mistral-7B-v0.1")
+output = client.generate("Deepspeed is", max_new_tokens=128)
+print(output)
+```
+
+A persistent deployment can be terminated when it is no longer needed:
+
+```python
+client.terminate_server()
+```
+
+### C. Advanced Installation Information
+
+For ease of use and a significant reduction in lengthy compile times that many projects require in this space, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source).
+
+
+# 6. Try Out DeepSpeed-FastGen
+We are very excited to share this DeepSpeed-FastGen alpha release.
+
+* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII)
+
+DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, for example on deep learning research and on applying DeepSpeed to empower real-world AI models and applications. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
+
+The following items are on our roadmap and we plan to engage with our community on these through our GitHub issues and PRs:
+
+- Performance improvements
+- Broader model support
+- New hardware backends through collaboration with partners
+- Release performance benchmarks (used to generate plots in this blog)
+
+**"Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories if you like our work!**
+
+# 7. Acknowledgements
+
+We would like to thank various open-source community projects including HuggingFace, vLLM, and HuggingFace TGI. We have leveraged HF APIs to support models and tokenizers in our alpha release and will continue to add more models. We especially acknowledge and thank the developers of [Flash Attention](https://github.com/Dao-AILab/flash-attention) for their great work. We have extensively leveraged FlashAttention kernels in our system with modifications that have been acknowledged in our code repositories at appropriate file headers. Finally, we want to thank the developers of [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) kernels that we have used in our MoE kernels (released as part of DeepSpeed-Kernels repository).
diff --git a/blogs/deepspeed-fastgen/assets/images/A6000_benchmark.png b/blogs/deepspeed-fastgen/assets/images/A6000_benchmark.png
new file mode 100644
index 000000000000..9d4ab55f5f7a
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/A6000_benchmark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/H100_benchmark.png b/blogs/deepspeed-fastgen/assets/images/H100_benchmark.png
new file mode 100644
index 000000000000..89fb9ca3e1ce
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/H100_benchmark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/effective_throughput.png b/blogs/deepspeed-fastgen/assets/images/effective_throughput.png
new file mode 100644
index 000000000000..11c7f82bc54f
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/effective_throughput.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/effective_throughput_main.png b/blogs/deepspeed-fastgen/assets/images/effective_throughput_main.png
new file mode 100644
index 000000000000..1b9a38306e8e
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/effective_throughput_main.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fast-gen-overview.jpg b/blogs/deepspeed-fastgen/assets/images/fast-gen-overview.jpg
new file mode 100644
index 000000000000..2affbf8a4cc3
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fast-gen-overview.jpg differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-arch-dark.png b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-dark.png
new file mode 100644
index 000000000000..9b90357a3f1b
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-dark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-arch-light.png b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-light.png
new file mode 100644
index 000000000000..9e754abde85d
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-light.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-hero-dark.png b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-dark.png
new file mode 100755
index 000000000000..6ac1a775805b
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-dark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-hero-light.png b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-light.png
new file mode 100755
index 000000000000..af8f1defe653
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-light.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-overview-dark.png b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-dark.png
new file mode 100755
index 000000000000..dde598a985d8
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-dark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-overview-light.png b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-light.png
new file mode 100755
index 000000000000..bdb5f8df483e
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-light.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-flops.png b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-flops.png
new file mode 100644
index 000000000000..6d45880588d9
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-flops.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-latency.png b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-latency.png
new file mode 100644
index 000000000000..7c14e2bf6e53
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-latency.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/repl_scale_llama70b_tp4_p2600g60.png b/blogs/deepspeed-fastgen/assets/images/repl_scale_llama70b_tp4_p2600g60.png
new file mode 100644
index 000000000000..834c06dfb07a
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/repl_scale_llama70b_tp4_p2600g60.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p1200g128.png b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p1200g128.png
new file mode 100644
index 000000000000..df16b5bebc53
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p1200g128.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p2600g128.png b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p2600g128.png
new file mode 100644
index 000000000000..8b69a8a1718b
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p2600g128.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/throughput_latency.png b/blogs/deepspeed-fastgen/assets/images/throughput_latency.png
new file mode 100644
index 000000000000..aaceebde7038
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/throughput_latency.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/throughput_latency_13B_no_arrow.png b/blogs/deepspeed-fastgen/assets/images/throughput_latency_13B_no_arrow.png
new file mode 100644
index 000000000000..cc7b8ec1ec05
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/throughput_latency_13B_no_arrow.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/token_latency.png b/blogs/deepspeed-fastgen/assets/images/token_latency.png
new file mode 100644
index 000000000000..405a3c0d06ed
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/token_latency.png differ
diff --git a/blogs/deepspeed-fastgen/chinese/README.md b/blogs/deepspeed-fastgen/chinese/README.md
new file mode 100644
index 000000000000..fb9cc7319ab6
--- /dev/null
+++ b/blogs/deepspeed-fastgen/chinese/README.md
@@ -0,0 +1,299 @@
+
+
+# DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference
+
+
+
+
+
+
+
+
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Key LLM Serving Techniques](#background)
+3. [Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy](#technical-approach)
+4. [Performance Evaluation](#performance-evaluation)
+5. [DeepSpeed-FastGen: Implementation and Usage](#using-deepspeed-fastgen)
+6. [Try out DeepSpeed-FastGen](#try)
+7. [Acknowledgements](#acknowledgements)
+
+
+## 1. Introduction
+
+Large language models (LLMs) like GPT-4 and LLaMA have become the dominant workload for applications infused with AI at every level. From general chat models to document summarization, and from autonomous driving to copilot features across the software stack, the demand to deploy and serve these models at scale has risen rapidly. While frameworks like DeepSpeed and PyTorch can achieve good hardware utilization during LLM training, the relatively low computational intensity of interactive use and tasks like open-ended text generation makes inference throughput the bottleneck in existing systems.
+
+To address this, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf), powered by PagedAttention, and systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved LLM inference performance. However, these systems still struggle to provide consistent quality of service for workloads with long prompts. These long-prompt workloads are becoming increasingly important as more models (e.g., [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b)) and systems (e.g., [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses)) support context windows stretching to tens of thousands of tokens. To better understand the problem, we give detailed examples below of how LLM text generation works in two phases, prompt processing and generation. When systems treat them as strictly distinct phases, generation is preempted by prompt processing, which can break service level agreements (SLAs).
+
+Today, we are glad to introduce DeepSpeed-FastGen, a framework that overcomes these limitations by adopting our proposed Dynamic SplitFuse technique and delivers up to 2.3x higher effective throughput than state-of-the-art systems like vLLM. DeepSpeed-FastGen combines DeepSpeed-MII and DeepSpeed-Inference to provide an easy-to-use serving system.
+
+**Quick Start:** To use DeepSpeed-FastGen, simply install the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) release:
+
+```bash
+pip install deepspeed-mii
+```
+
+To deploy a simple non-persistent pipeline and generate text, run the following code. For more details, see [Section 5](#using-deepspeed-fastgen).
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+## 2. Existing LLM Serving Techniques
+
+A text generation workload for a single sequence consists of two phases: 1) prompt processing, in which the user-provided text is processed as a batch of tokens to build a key-value (KV) cache for attention, and 2) token generation, in which a single token is appended to the cache and a new token is produced. Over the course of generating a text sequence, the system makes many forward calls to the model to produce the full text. Two major techniques proposed in the literature and deployed in existing systems address the limitations and bottlenecks that can arise in these phases.
+
+_Blocked KV Caching:_
+
+vLLM identified that memory fragmentation caused by large monolithic KV caches significantly reduces the concurrency of LLM serving systems, and proposed [Paged Attention](https://arxiv.org/pdf/2309.06180.pdf) to enable non-contiguous KV caches and increase total system throughput. Rather than allocating variable-sized contiguous chunks of memory per request, the underlying storage of a blocked KV cache consists of fixed-sized blocks (also known as pages). The blocked KV cache increases system throughput by eliminating KV-cache-induced memory fragmentation and thereby increasing potential sequence concurrency. Non-contiguous KV cache implementations are also included in frameworks such as [HuggingFace TGI](https://github.com/huggingface/text-generation-inference) and [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
+
+_Continuous Batching:_
+
+In the past, dynamic batching (where the server waits for multiple requests so they can be processed in lockstep) was used to improve GPU utilization. However, this approach has drawbacks: it typically requires padding inputs to identical lengths or stalling the system to wait for a larger batch to form.
+
+Recent work on LLM inference and serving has focused on fine-grained scheduling and memory-efficiency optimizations. For instance, Orca proposes _iteration-level scheduling_ (also known as continuous batching), which makes distinct scheduling decisions at each forward pass of the model. This allows requests to join and leave the batch as needed, eliminating the need for padding and improving overall throughput. Beyond Orca, continuous batching has also been implemented in NVIDIA TRT-LLM, HuggingFace TGI, and vLLM.
+
+In current systems, there are two primary approaches to implementing continuous batching. In TGI and vLLM, the generation phase is preempted to perform prompt processing (called infill in TGI) before continuing with generation. In Orca, these phases are not distinguished; instead, Orca adds a prompt to the running batch as long as the total number of sequences does not reach a fixed bound. Both approaches, to varying degrees, must stall generation to process long prompts (see [Section 3B](#splitfuse)).
+
+To address these shortcomings, we propose a novel prompt and generation composition strategy, Dynamic SplitFuse.
+
+## 3. Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy
+
+Similar to existing frameworks such as TRT-LLM, TGI, and vLLM, DeepSpeed-FastGen aims to leverage continuous batching and non-contiguous KV caches to improve hardware occupancy and responsiveness when serving LLMs in the data center. To achieve a higher level of performance, DeepSpeed-FastGen introduces SplitFuse, which leverages dynamic prompt and generation decomposition and unification to further improve continuous batching and system throughput.
+
+### A. Three Performance Insights
+Before describing Dynamic SplitFuse, we answer three key performance questions that explain the logic behind it.
+
+*__1. What factors impact the forward pass of a single LLM?__* To schedule effectively, we must first understand which independent variables the scheduling loop should control. We observe below that the composition of sequences in a forward pass (the batch size in sequences) has a negligible impact on performance compared to the raw number of tokens in the forward pass. This means an effective scheduler can be built around a single signal: the number of tokens in the forward pass.
+
+
+
+
+
+*__2. How does a model's throughput respond to changing the number of tokens in the forward pass?__* An LLM has two key operating regions with a relatively steep transition between them. With a small number of tokens, the GPU bottleneck is reading the model from memory, so throughput scales with the number of tokens; with many tokens, throughput is limited by GPU compute and is near constant. The model therefore runs most efficiently if all forward passes stay in the throughput-saturating region.
+
+
+
+
+
+*__3. How should a pool of tokens be scheduled across multiple forward passes?__* We observe in the figures above that for well-aligned inputs the token-throughput curve is concave, meaning its second derivative is less than or equal to 0. Let $f(x)$ be the concave throughput of a forward pass with $x$ tokens for a given model. For a concave function $f(x)$, the following holds:
+
+ $$0 \geq \lim_{h \to 0} \frac{f(x + h) - 2f(x) + f(x - h)}{h^2}$$
+
+ $$0 \geq f(x + h) - 2f(x) + f(x - h)$$
+
+ $$2f(x) \geq f(x + h) + f(x - h)$$
+
+This shows that for a given pool of `2x` tokens, throughput is maximized by splitting them evenly between two batches. More generally, in a system that must process P tokens over F forward passes, the ideal partitioning scheme divides them equally.
+
+### B. Dynamic SplitFuse
+
+Dynamic SplitFuse is a novel token composition strategy for prompt processing and token generation. DeepSpeed-FastGen uses Dynamic SplitFuse to run at a consistent forward size by taking partial tokens from prompts and composing them with generation. Specifically, Dynamic SplitFuse performs two key behaviors:
+
+1. Long prompts are decomposed into much smaller chunks and scheduled across multiple forward passes (iterations), with only the final pass performing any generation.
+2. Short prompts are composed to exactly fill a target token budget. Even short prompts may be decomposed to ensure the budget is precisely met and forward sizes are well-aligned.
+
+Dynamic SplitFuse improves the following performance metrics:
+
+1. **Better responsiveness:** Since long prompts no longer require extremely long forward passes, the model provides lower client latency, and more forward passes are performed within the same window of time.
+2. **Higher efficiency:** Fusing short prompts into larger token budgets enables the model to consistently operate in the high-throughput regime.
+3. **Lower variance and better consistency:** Since forward passes are of consistent size, and forward-pass size is the primary determinant of performance, the latency of each forward pass is much more consistent than in other systems, as is the perceived generation frequency. DeepSpeed-FastGen does not need to preempt generation or run long prompts as prior systems do, so latency is lower.
+
+Consequently, compared to state-of-the-art serving systems, DeepSpeed-FastGen consumes tokens from incoming prompts at a rate that permits fast ongoing generation while adding tokens that raise system utilization, providing lower-latency, higher-throughput streaming generation to all clients.
+
+
+
+
+
+ *Figure 1: Illustration of continuous batching strategies. Each block shows the execution of a forward pass. An arrow indicates a forward pass with sequences that have one or more generated tokens. vLLM performs either token generation or prompt processing in a forward pass; token generation preempts prompt processing. Orca runs prompts at their complete length alongside generation. Dynamic SplitFuse performs dynamic composition of fixed-sized batches composed of both generation and prompt tokens.*
+
+
+
+## 4. Performance Evaluation
+
+DeepSpeed-FastGen provides state-of-the-art LLM serving performance by leveraging its blocked KV cache and Dynamic SplitFuse continuous batching. We evaluate DeepSpeed-FastGen against vLLM on a range of models and hardware configurations following the benchmarking methodology below.
+
+### A. Benchmarking Methodology
+
+We use two primary quantitative schemes to measure performance.
+
+**Throughput-Latency Curves:** Two key metrics for production readiness are throughput (measured in requests per second) and latency (the responsiveness of each request). To measure them, we simulate multiple clients (ranging from 1 to 32) concurrently sending requests (512 in total) to the server. The latency of each request is measured at the endpoint, and throughput is measured by the end-to-end time to complete the experiment.
+
+**Effective Throughput:** Interactive applications, such as chat applications, can have more stringent and complex requirements than can be captured by top-level metrics like end-to-end latency. Consider the increasingly popular chat scenario:
+
+ 1. A user initiates a task by sending a prompt.
+ 2. The system processes the prompt and returns the first token.
+ 3. Subsequent tokens are streamed to the user as they are produced.
+
+At each point in this process, a system can deliver an adverse user experience; for example, the first token may arrive too slowly, or generation may appear to stall for some time. We propose an SLA framework that considers both of these dimensions.
+
+As the lengths of prompts and generated texts vary significantly, affecting computational costs, it is impractical to set a single rigid SLA value for throughput and latency. We therefore define the SLA for prompt latency as |tokens in prompt| / 512 seconds (= 512 tokens/s). Additionally, considering typical human reading speed, we set the SLA for generation latency, computed as an exponential moving average (EMA), to 2, 4, or 6 tokens/sec. Requests that meet these SLAs are deemed successful, and the throughput of these successful requests is referred to as **effective throughput**.
+
+We evaluate vLLM and DeepSpeed-FastGen on Llama-2 7B, Llama-2 13B, and Llama-2 70B on NVIDIA A100, H100, and A6000 GPUs.
+
+### B. Throughput-Latency Analysis
+
+In this experiment, DeepSpeed-FastGen outperforms vLLM in both throughput and latency: it delivers greater throughput at the same latency and lower response latency at the same throughput. As shown in Figure 2, on Llama-2 70B running on 4 A100-80GB GPUs, DeepSpeed-FastGen demonstrates up to 2x higher throughput (1.36 rps vs. 0.67 rps) at identical latency (9 seconds), or up to 50% lower latency (7 seconds vs. 14 seconds) at the same throughput (1.2 rps). DeepSpeed-FastGen shows the same trends when evaluating Llama-2 13B, as shown in Figure 3.
+
+
+
+
+ *Figure 2: Throughput and latency of text generation using Llama 2 70B (tensor parallelism across 4 A100-80GB GPUs). Prompt and generation lengths follow normal distributions with averages of 1200/2600 and 128/60, respectively, and 30% variance*
+
+
+
+
+
+ *Figure 3: Throughput and latency of text generation using Llama 2 13B (one A100-80GB GPU, no tensor parallelism). Prompt and generation lengths follow normal distributions with averages of 1200/2600 and 60/128, respectively, and 30% variance*
+
+
+### C. Effective Throughput Analysis
+
+Under the effective throughput analysis, which considers both first-token latency and the rate at which generation proceeds, DeepSpeed-FastGen provides up to 2.3x higher throughput than vLLM. Figure 4 presents a comparative analysis of the effective throughputs of DeepSpeed-FastGen and vLLM. Each plotted point denotes the effective throughput achieved with a specific number of clients. As we scale the number of clients, effective throughput initially increases. However, as the number of clients approaches the system's capacity, latency also increases significantly, causing many requests to miss the SLA, so effective throughput eventually saturates or decreases. From a usability perspective, the number of clients required to reach the maximum effective throughput is not particularly relevant; the highest point of the curve is the optimal serving point.
+
+
+
+
+ *Figure 4: Effective throughput of DeepSpeed-FastGen and vLLM (Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance)*
+
+
+When vLLM preempts the ongoing generation of previous requests, generation latency increases noticeably. This makes vLLM's effective throughput appear lower than its directly measured throughput. At vLLM's peak, the effective throughput was 0.63 queries/sec, and around 28% of requests failed to meet the 4 tokens/s SLA. At the same SLA, DeepSpeed-FastGen achieved 1.42 queries/sec (with fewer than 1% of requests missing the SLA), 2.3x higher than vLLM.
+
+### D. Token-Level Timing Analysis
+
+Figure 5 displays the P50, P90, and P95 latencies of the generation process. vLLM and DeepSpeed-FastGen exhibit similar P50 latencies, but vLLM's P90 and P95 latencies are significantly higher.
+
+This discrepancy arises because vLLM's generation latency spikes noticeably when it preempts ongoing generation to process new prompts.
+In contrast, DeepSpeed-FastGen typically processes prompts and the generation of previous requests concurrently, yielding much more consistent generation latency.
+
+
+
+
+ *Figure 5: Per-token generation latency of Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs, 16 clients. Prompt and generation lengths follow normal distributions with averages of 2600 and 128, respectively, and 30% variance.*
+
+
+
+### E. Scalability using Load Balancing
+
+DeepSpeed-FastGen offers replica-level load balancing that evenly distributes requests across multiple servers, allowing you to effortlessly scale up your application.
+
+Figure 6 illustrates the scalability of DeepSpeed-FastGen when employing the load balancer with up to 16 replicas. Note that we utilized 4 A100 GPUs per Llama 2 70B replica; in total, we employed 8 nodes to run the 16 replicas. The results demonstrate nearly perfect scalability with DeepSpeed-FastGen:
+given a single-replica throughput of 1.46 queries/sec, 16 replicas reach 23.7 queries/sec, a linear 16x increase over a single replica.
+
+
+
+
+ *Figure 6: Scalability using the load balancing feature. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance*
+
+
+### F. Other Hardware Platforms
+
+In addition to the in-depth analysis on A100, we provide benchmarking results for H100 and A6000. The same performance trends observed on A100 hold on both A6000 and H100.
+
+
+
+
+ *Figure 7: Throughput-latency curve and effective throughput of Llama 2 70b using 8 H100 GPUs. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance*
+
+
+
+
+
+ *Figure 8: Throughput-latency curve and effective throughput of Llama 2 7b using an A6000. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance*
+
+
+## 5. DeepSpeed-FastGen: Implementation and Usage
+
+DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed), as illustrated in the figure below. Together, these two software packages provide the various components of the system, including the frontend APIs, the host and device infrastructure that schedules batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations.
+
+
+
+
+
+
+
+The fastest way to get started with our alpha release of DeepSpeed-FastGen is: `pip install deepspeed-mii`.
+
+Please follow our [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and to report issues, please use the [DeepSpeed-MII GitHub repository](https://github.com/microsoft/DeepSpeed-MII).
+
+### A. Supported Models
+
+In the current alpha release of DeepSpeed-FastGen, we support the following model architectures:
+
+* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
+* [Mistral](https://huggingface.co/models?other=mistral)
+* [OPT](https://huggingface.co/models?other=opt)
+
+All current models leverage the [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
+
+> We plan to add more models in the weeks and months following the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know.
+
+### B. Deployment Options
+All of the examples below are runnable in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). Once installed, you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment:
+
+#### Non-persistent pipeline
+
+The non-persistent pipeline deployment is a great and fast way to get started and requires only a few lines of code. Non-persistent models exist only for the duration of the Python script you are running and are useful for temporary interactive sessions.
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+#### Persistent deployment
+
+A persistent deployment is ideal for long-running and production applications. The persistent deployment uses a lightweight GRPC server that can be created with the following two lines:
+
+```python
+import mii
+mii.serve("mistralai/Mistral-7B-v0.1")
+```
+
+Thanks to DeepSpeed-MII's built-in load balancer, the above server can be queried by multiple clients at once. Creating a client also takes just two lines of code:
+
+```python
+client = mii.client("mistralai/Mistral-7B-v0.1")
+output = client.generate("Deepspeed is", max_new_tokens=128)
+print(output)
+```
+
+A persistent deployment can be terminated when it is no longer needed:
+
+```python
+client.terminate_server()
+```
+
+### C. Advanced Installation Information
+
+For ease of use and to significantly reduce the lengthy compile times that many projects in this space require, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs of compute capability 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you won't even need to know this library exists, as it is a dependency of DeepSpeed-MII and is installed automatically with it. However, if for whatever reason you need to compile our kernels manually, please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source).
+
+
+# 6. Try Out DeepSpeed-FastGen
+We are very excited to share this first alpha release of DeepSpeed-FastGen.
+
+* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII)
+
+DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of deep learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for the latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page; see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, for example on deep learning research and on applying DeepSpeed to empower real-world AI models and applications. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
+
+The following items are on our roadmap, and we plan to engage with our community on them through GitHub issues and PRs:
+
+- Performance improvements
+- Broader model support
+- New hardware backends through collaboration with partners
+- Releasing the performance benchmarks used to generate the plots in this blog
+
+If you like our work, please "star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories!
+
+# 7. Acknowledgements
+
+We would like to thank various open-source community projects, including HuggingFace, vLLM, and HuggingFace TGI. In this alpha release we leverage HF APIs to provide models and tokenizers, and we plan to add more models. We especially acknowledge and thank the developers of [Flash Attention](https://github.com/Dao-AILab/flash-attention) for their great work; we have extensively leveraged FlashAttention kernels in our system, with modifications acknowledged in the appropriate file headers of our code repositories. Finally, we thank the developers of the [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) kernels that we use in our MoE kernels (released as part of the DeepSpeed-Kernels repository).
diff --git a/blogs/deepspeed-fastgen/japanese/README.md b/blogs/deepspeed-fastgen/japanese/README.md
new file mode 100644
index 000000000000..9729854afcf0
--- /dev/null
+++ b/blogs/deepspeed-fastgen/japanese/README.md
@@ -0,0 +1,315 @@
+
+
+# DeepSpeed-FastGen: Fast Text Generation for LLMs via MII and DeepSpeed-Inference
+
+
+
+
+
+
+
+
+## Table of Contents
+1. [Overview](#introduction)
+2. [Existing Techniques for LLM Text Generation](#background)
+3. [Dynamic SplitFuse: A New Approach to Composing Prompt Processing and Generation](#technical-approach)
+4. [Performance Evaluation](#performance-evaluation)
+5. [DeepSpeed-FastGen: Implementation and Usage](#using-deepspeed-fastgen)
+6. [Try out DeepSpeed-FastGen](#try)
+7. [Acknowledgements](#acknowledgements)
+
+
+## 1. Overview
+
+Large language models (LLMs) such as GPT-4 and LLaMA have become the primary workload in a wide range of applications that use AI for various purposes. From general chat models to document summarization, autonomous driving, and programming assistance at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. While frameworks such as DeepSpeed and PyTorch generally achieve good hardware utilization for LLM training, for tasks such as open-ended text generation the small amount of computation executed at a time on GPUs and other hardware makes inference throughput the bottleneck in existing systems.
+
+Existing systems such as [vLLM](https://arxiv.org/pdf/2309.06180.pdf), powered by PagedAttention, and [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) were designed to address these challenges and have significantly improved LLM inference performance. However, these systems still fall short of providing consistent quality of service, particularly for workloads with long prompts.
+These long-prompt workloads are becoming increasingly important as more models and systems, such as [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b) and [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows of many thousands of tokens.
+To understand these problems more deeply, we explain how LLM text generation works. It consists of two distinct phases, called prompt processing and generation. If a system treats them as completely independent, generation is interrupted by prompt processing, which raises the likelihood of violating service level agreements (SLAs) on system latency and the like.
+
+DeepSpeed-FastGen, introduced in this blog, resolves these challenges by leveraging techniques such as our newly proposed Dynamic SplitFuse, achieving up to 2.3x higher effective throughput than state-of-the-art systems such as vLLM.
+DeepSpeed-FastGen combines DeepSpeed-MII and DeepSpeed-Inference to deliver easy-to-use text generation.
+
+
+**Quick Start:** You can try DeepSpeed-FastGen simply by installing the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII):
+
+
+```bash
+pip install deepspeed-mii
+```
+
+To generate text using the simpler non-persistent pipeline (which does not launch an inference server), run the following code. For more details, see [Section 5](#using-deepspeed-fastgen).
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+## 2. Existing Techniques for LLM Text Generation
+
+A workload that generates a text sequence consists of two phases: 1) prompt processing, in which the user-provided text is efficiently processed as a batch of tokens to build the key-value (KV) cache for attention, and 2) token generation, in which a single token is appended to the KV cache and a new token is produced. Over the course of generating a text sequence, the model makes many forward-pass calls to produce the full text. To resolve the various limitations and bottlenecks in these phases, existing systems adopt two major previously proposed techniques.
+
+_Blocked KV Caching:_
+
+vLLM identified that allocating a huge monolithic memory region for the KV cache significantly reduces the concurrency of LLM text generation systems, and as a solution proposed [Paged Attention](https://arxiv.org/pdf/2309.06180.pdf), which increases total system throughput by using non-contiguously allocated memory regions as the KV cache. Rather than allocating contiguous memory regions of varying sizes per request, it allocates fixed-size memory blocks (also called pages). This blocked KV cache increases the number of sequences that can potentially be processed concurrently by resolving KV-cache-induced memory fragmentation, thereby increasing system throughput. Such non-contiguous KV cache implementations are also included in [HuggingFace TGI](https://github.com/huggingface/text-generation-inference) and [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
+
+
+_Continuous Batching:_
+
+Previously, dynamic batching, in which the server waits so that multiple requests can be processed together, was used to improve GPU utilization. However, this approach has drawbacks: it usually requires padding inputs to the same length, or stalling to wait until enough requests arrive to build a larger batch.
+
+Recent work on LLM inference and serving has focused on finer-grained scheduling and memory-efficiency optimization. For example, Orca proposes _iteration-level scheduling_ (also called continuous batching), which makes scheduling decisions at every forward pass of the model. This allows requests to be added to or removed from the batch as needed, eliminating padding and improving overall throughput. Continuous batching is implemented not only in Orca but also in NVIDIA's TRT-LLM, HuggingFace's TGI, and vLLM.
+
+In current systems, there are two primary approaches to implementing continuous batching. In TGI and vLLM, the generation phase is interrupted to perform prompt processing (called infill in TGI) before generation continues. In Orca, these phases are not distinguished; instead, a prompt is added to the running batch as long as the total number of sequences stays below a fixed limit. Both approaches are similar, to varying degrees, in that they must pause generation to process long prompts (see [Section 3B](#splitfuse)).
+
+
+To address these challenges, we propose a new method called Dynamic SplitFuse that composes prompt processing and generation.
+
+
+## 3. Dynamic SplitFuse: A New Approach to Composing Prompt Processing and Generation
+
+DeepSpeed-FastGen was developed to achieve higher occupancy and responsiveness when serving LLMs in the data center by leveraging continuous batching and non-contiguous KV caches, like existing frameworks such as TRT-LLM, TGI, and vLLM. To achieve a higher level of performance, DeepSpeed-FastGen introduces SplitFuse, which dynamically decomposes prompts and combines them with generation to further improve continuous batching and system throughput.
+
+
+### A. Three Performance Insights
+
+Before describing Dynamic SplitFuse, we present three key performance questions, and their answers, that motivate its design.
+
+*__1. What factors affect a single LLM forward pass?__* To schedule effectively, it is necessary to understand which relevant independent variables the iteratively executed scheduling loop should control. We observe below that the composition of sequences in a forward pass (the batch size in sequences) has a negligible impact on performance compared to the raw number of tokens in the forward pass. This means an effective scheduler can be built by focusing mainly on a single factor: the number of tokens in the forward pass.
+
+
+
+
+
+*__2. How does model throughput respond to changes in the number of tokens in the forward pass?__* An LLM has two key operating regions with a relatively steep transition between them. With a small number of tokens, the GPU bottleneck is reading the model from memory, so throughput scales with the number of tokens; with many tokens, throughput is limited by compute and is nearly constant. For efficient execution, it is desirable that all forward passes run in the throughput-saturating region.
+
+
+
+
+
+*__3. How should a pool of tokens be scheduled across multiple forward passes?__* As noted above, when inputs are well-aligned, the token-throughput curve is concave, which means its second derivative is less than or equal to 0. As an example, let $f(x)$ be the concave throughput of a forward pass with $x$ tokens for a given model. For a concave function $f(x)$, the following holds:
+
+ $$0 \geq \lim_{h \to 0} \frac{f(x + h) - 2f(x) + f(x - h)}{h^2}$$
+
+ $$0 \geq f(x + h) - 2f(x) + f(x - h)$$
+
+ $$2f(x) \geq f(x + h) + f(x - h)$$
+
+This states that for a pool of `2x` tokens to process, the way to maximize throughput is to split them evenly into two batches. More generally, in a system that must process `P` tokens over `F` forward passes, the ideal partitioning scheme splits them equally.
+
+### B. Dynamic SplitFuse
+
+Dynamic SplitFuse is a new approach to composing prompt processing and token generation. DeepSpeed-FastGen uses Dynamic SplitFuse to run at a consistent forward size by taking partial tokens from prompts and combining them with generation. Dynamic SplitFuse consists of the following two key behaviors:
+
+1. Long prompts are decomposed into much smaller chunks and scheduled across multiple forward passes (iterations); generation is performed only in the final pass.
+2. Short prompts are scheduled so that the target number of tokens for a forward pass is met exactly. Even short prompts may be decomposed so that the token budget given to a forward pass is met precisely and forward sizes are even across passes.
+
+Combining these two techniques yields concrete benefits on all user metrics:
+
+1. *__Better responsiveness__*: Since long prompts no longer cause extremely long forward passes, the model achieves lower latency as seen by the client. More forward passes are executed within the same window of time.
+2. *__Higher efficiency__*: Running short prompts together with tokens from other requests keeps the model operating consistently at high throughput.
+3. *__Lower latency variance and better consistency__*: The number of tokens given to a single forward pass varies less. Since the token count of a forward pass is the primary determinant of performance, the latency of each forward pass is much more consistent than in competing systems. There is no latency increase from preemption or long-running prompts as in other prior work.
+
+As a result, by adding tokens to forward passes in a way that raises system utilization, DeepSpeed-FastGen can process the prompts of arriving requests while keeping ongoing generation fast. This provides
+lower-latency, higher-throughput streaming generation to all clients compared to other state-of-the-art text generation systems.
+
+
+
+
+
+
+*Figure 1: Illustration of continuous batching strategies. Each block shows the execution of a forward pass. An arrow indicates a forward pass with sequences that have one or more generated tokens. vLLM performs either token generation or prompt processing in a forward pass; token generation preempts prompt processing. Orca runs prompts at their complete length alongside generation. Dynamic SplitFuse performs dynamic composition of fixed-size batches composed of both generation and prompt tokens.*
+
+
+## 4. Performance Evaluation
+
+DeepSpeed-FastGen provides state-of-the-art LLM serving performance by leveraging its blocked KV cache and Dynamic SplitFuse continuous batching. We evaluate DeepSpeed-FastGen and vLLM on a range of models and hardware configurations following the benchmarking methodology discussed below.
+
+### A. Benchmarking Methodology
+
+We use two primary quantitative schemes to measure performance.
+
+**Throughput-Latency Curves**: Two key metrics for production use are throughput (measured in requests per second) and latency (the responsiveness of each request). To measure them, we launch multiple clients (from 1 to 32) concurrently and send requests (512 in total) to the server. The resulting latency is measured per request, and throughput is measured by the end-to-end time to complete the experiment.
+
+**Effective Throughput**: Interactive applications such as chat applications can have stricter and more complex requirements than are captured by top-level metrics like end-to-end latency. Here we focus on the rapidly spreading chat-application user scenario:
+
+1. The user initiates a task by sending a prompt.
+2. The system processes the prompt and returns the first token.
+3. Subsequent tokens are streamed to the user as they are generated.
+
+At each point in this process, the user experience can suffer; for example, if the first token arrives too slowly, or if generation appears to stop for a while. We propose an SLA framework that takes both of these aspects into account.
+
+Since prompt and generated-text lengths vary widely, and this affects computational cost, it is unrealistic to set a single rigid SLA value for throughput and latency. We therefore define the prompt-latency SLA as |tokens in prompt| / 512 seconds (= 512 tokens/s). In addition, considering human reading speed, we set the generation-latency SLA, on an exponential moving average (EMA), to 2, 4, or 6 tokens per second. Requests that satisfy these SLAs are considered successful, and the throughput of these successful requests is the **effective throughput**.
+
+We evaluated vLLM and DeepSpeed-FastGen with Llama-2 7B, Llama-2 13B, and Llama-2 70B on NVIDIA A100, H100, and A6000 GPUs.
+
+### B. Throughput-Latency Analysis
+
+In this experiment, DeepSpeed-FastGen outperforms vLLM in both throughput and latency, providing lower latency at the same throughput or higher throughput at the same latency. With Llama-2 70B on 4 A100 GPUs (80GB each), DeepSpeed-FastGen demonstrates 2x higher throughput (1.36 rps vs. 0.67 rps) at the same latency (9 seconds), or up to 50% lower latency (7 seconds vs. 14 seconds) while achieving the same throughput (1.2 rps). These results are shown in Figure 2. The same trend holds in the Llama-2 13B evaluation shown in Figure 3.
+
+
+
+
+
+ *Figure 2: Throughput and latency of text generation (Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs). Prompt and generation lengths are drawn from normal distributions with averages of 1200/2600 and 128/60, respectively, and 30% variance.*
+
+
+
+
+
+ *Figure 3: Throughput and latency of text generation (Llama 2 13B on one A100-80GB GPU without tensor parallelism). Prompt and generation lengths are drawn from normal distributions with averages of 1200/2600 and 60/128, respectively, and 30% variance.*
+
+
+### C. Effective Throughput Analysis
+
+In the effective throughput analysis, which considers both first-token latency and the rate of generation, DeepSpeed-FastGen delivers up to 2.3x the performance of vLLM. Figure 4 shows a comparative analysis of the effective throughput of DeepSpeed-FastGen and vLLM. Each plotted point represents the effective throughput obtained with a particular number of clients. We observed that effective throughput initially increases as the number of clients grows. However, as the number of clients approaches the system's capacity, latency also increases substantially, and many requests fail to meet the SLA. As a result, effective throughput peaks at some point and then decreases. From a usability standpoint, the number of clients needed to reach maximum effective throughput is not particularly important; the highest point of the curve is the optimal serving point.
+
+
+
+
+ *Figure 4: Effective throughput of DeepSpeed-FastGen and vLLM. Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+When vLLM interrupts the ongoing generation of previous requests to process new prompts, generation latency increases noticeably. This makes vLLM's effective throughput appear lower than its directly measured throughput. At vLLM's peak, effective throughput was 0.63 queries/sec, and about 28% of requests failed to meet the 4 tokens/sec SLA. At the same SLA, DeepSpeed-FastGen achieved 1.42 queries/sec (fewer than 1% of requests missed the SLA), 2.3x that of vLLM.
+
+### D. Token-Level Latency Analysis
+
+Figure 5 shows the P50, P90, and P95 latencies of the generation process. Comparing vLLM and DeepSpeed-FastGen, there is no large difference in P50 latency, but vLLM shows markedly higher P90 and P95 latencies.
+For P95 latency, DeepSpeed-FastGen achieves a 3.7x reduction.
+
+This difference arises because vLLM's generation latency spikes noticeably when it interrupts ongoing generation to process new prompts.
+In contrast, DeepSpeed-FastGen typically processes the prompts and generation of previous requests concurrently, achieving much more consistent generation latency.
+
+
+
+
+ *Figure 5: Per-token generation latency. Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs, 16 clients. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 128, respectively, and 30% variance.*
+
+
+
+### E. Scalability using Load Balancing
+DeepSpeed-FastGen provides replica-level load balancing that distributes requests evenly across multiple servers, allowing applications to scale up easily.
+
+Figure 6 shows the scalability of DeepSpeed-FastGen using the load balancer with up to 16 replicas. We used 4 A100 GPUs per replica to compute the Llama 2 70B model, and 8 nodes in total to run the 16 replicas. The results demonstrate nearly perfect scalability for DeepSpeed-FastGen: given a single-replica throughput of 1.46 queries/sec, the throughput with 16 replicas reaches 23.7 queries/sec, a linear 16x increase over a single replica.
+
+
+
+
+ *Figure 6: Scalability using the load balancing feature. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+### F. Other Hardware Platforms
+
+In addition to the analysis on A100 GPUs, we provide benchmark results on H100 and A6000. The same performance trends as on A100 were observed on both A6000 and H100.
+
+
+
+
+ *Figure 7: Throughput-latency curve and effective throughput of Llama 2 70b using 8 H100 GPUs. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+
+
+
+ *Figure 8: Throughput-latency curve and effective throughput of Llama 2 7b using an A6000. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+## 5. DeepSpeed-FastGen: Implementation and Usage
+
+DeepSpeed-FastGen is a synergistic combination of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed), as shown in the figure below. These software packages provide the various components of the system: the frontend APIs, the host and device infrastructure that schedules batches using Dynamic SplitFuse, optimized kernel implementations, and the tools for building new model implementations.
+
+
+
+
+
+
+
+The easiest way to get started with the alpha release of DeepSpeed-FastGen is to run `pip install deepspeed-mii`.
+
+For details, see the [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide. For usage and to report issues, please use the [DeepSpeed-MII GitHub repository](https://github.com/microsoft/DeepSpeed-MII).
+
+### A. Supported Models
+
+The current alpha release of DeepSpeed-FastGen supports the following model architectures:
+
+* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
+* [Mistral](https://huggingface.co/models?other=mistral)
+* [OPT](https://huggingface.co/models?other=opt)
+
+All current models use [HuggingFace](https://github.com/huggingface) in the backend to provide both the model weights and the model's corresponding tokenizer.
+
+We plan to add more models in the weeks and months following the initial release. If there is a specific model architecture you would like supported, please file an [issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know.
+
+### B. Deployment Options
+
+All of the examples below can be run in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). After installation, there are two deployment options: an interactive non-persistent pipeline, or a persistent serving deployment.
+
+#### Non-persistent pipeline
+
+The non-persistent pipeline deployment is very easy to get started with and can be run in just a few lines of code.
+Non-persistent models only live for the duration of the Python script, but are convenient for temporary interactive sessions.
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+#### Persistent deployment
+
+A persistent deployment is intended for long-running and production applications. A persistent deployment launches a lightweight GRPC server, which can be done with the following two lines:
+
+```python
+import mii
+mii.serve("mistralai/Mistral-7B-v0.1")
+```
+
+Thanks to DeepSpeed-MII's built-in load balancer, the above server can receive queries from multiple clients at once. A client can also be used with just two lines of code:
+
+```python
+client = mii.client("mistralai/Mistral-7B-v0.1")
+output = client.generate("Deepspeed is", max_new_tokens=128)
+print(output)
+```
+
+A persistent deployment can be terminated when it is no longer needed, as follows:
+
+```python
+client.terminate_server()
+```
+
+### C. Advanced Installation Information
+
+In similar projects, compiling custom kernels often takes a very long time.
+To greatly shorten this compile time and improve convenience, DeepSpeed-FastGen distributes a pre-compiled Python wheel covering the majority of its custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels).
+We have found this library to be very portable across environments with NVIDIA GPUs of compute capability 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+.
+Since this library is installed as a dependency of DeepSpeed-MII, in most cases you do not need to know it exists. However, if for some reason you need to compile the kernels manually, see the [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source).
+
+## 6. Try Out DeepSpeed-FastGen
+
+We are very excited to share this DeepSpeed-FastGen alpha release with you.
+
+* To get started, please visit the DeepSpeed-MII GitHub page: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII)
+
+DeepSpeed-FastGen is part of the larger DeepSpeed ecosystem, which comprises a multitude of deep learning systems and modeling technologies. To learn more, please visit our
+[website](https://www.deepspeed.ai/), where you will find detailed blog posts, tutorials, and helpful documentation.
+For the latest DeepSpeed news, follow us on [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed).
+
+DeepSpeed welcomes your contributions. You can report bugs, submit pull requests, and join discussions on the DeepSpeed GitHub page; the [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) has more details.
+We also collaborate with universities, research labs, and companies working on deep learning research and on applying DeepSpeed to real-world AI models and applications. For such collaboration requests (and other topics not suited for GitHub), please email us directly at deepspeed-info@microsoft.com.
+
+The following items are on our roadmap, and we plan to engage with our community on them through GitHub issues and PRs:
+
+- Performance improvements
+- Broader model support
+- New hardware backends through collaboration with partners
+- Release of the performance benchmarks that generated the plots in this blog
+
+If you like this project, please give a "star" to the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories.
+
+## 7. Acknowledgements
+
+We thank various open-source community projects including HuggingFace, vLLM, and HuggingFace TGI. We leverage HF APIs to support models and tokenizers in our alpha release and will continue to add more models. We especially thank the developers of [Flash Attention](https://github.com/Dao-AILab/flash-attention) for their great work. We have extensively leveraged FlashAttention kernels in our system, with appropriate acknowledgements in the relevant file headers of our code repository. Finally, we thank the developers of the [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) kernels that we use in our MoE kernels (released as part of the DeepSpeed-Kernels repository).
diff --git a/blogs/deepspeed-offloadpp/README.md b/blogs/deepspeed-offloadpp/README.md
new file mode 100644
index 000000000000..1441da5a35c0
--- /dev/null
+++ b/blogs/deepspeed-offloadpp/README.md
@@ -0,0 +1,52 @@
+# DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow
+
+Deep learning has been successfully adopted in a wide range of applications such as speech recognition, chatbots, and text and image generation. To achieve better serving accuracy, model sizes have grown significantly. Taking language models as an example, from BERT with 110 million parameters to Megatron-Turing NLG with 530 billion parameters, model size has grown almost 5000x. Given limited GPU memory, we need to utilize it efficiently to achieve good system throughput.
+
+ZeRO offers a memory-efficient data-parallel training scheme. Even when training large models like LLMs with ZeRO, GPU memory is still often insufficient to hold all the model parameters. ZeRO-Offload was introduced to solve this problem: it relieves GPU memory pressure by offloading data and compute to the CPU side while minimizing CPU-GPU data copy overhead. Given that CPU memory is often orders of magnitude larger than GPU memory, ZeRO-Offload was the first work to enable training of billion-parameter models even with very limited GPU memory resources (in the extreme, a single GPU). ZeRO-Offload performs excellently when the model is several times larger than the total GPU memory.
+
+However, system efficiency is still far from optimal when adopting ZeRO-Offload in some scenarios, especially with small batch sizes or with models that do not fit in GPU memory yet are not orders of magnitude bigger than its capacity. In these cases, CPU offload not only introduces long end-to-end latency but also underutilizes GPU compute. To reduce memory copy latency and the resulting GPU underutilization in these offload cases, we propose ZeRO-Offload++, which leverages both CPU and GPU coherently. ZeRO-Offload++ comprises three new features: _Twin-Flow_, MemCpy reduction, and CPUAdam optimization. In this release we ship the __Twin-Flow__ feature.
+
+The key benefits are:
+* With _Twin-Flow_, ZeRO-Offload++ achieves up to **6x** training speedup compared with ZeRO-Offload.
+* A high-level API provided in the DeepSpeed config JSON makes the feature easy to use and tune.
+
+![h100-img](./images/h100-8.png)
+
+## Twin-Flow
+
+In DeepSpeed, when training with a popular optimizer like Adam, optimizer offloading follows an all-or-nothing policy. In the simplified example shown in the figure below, without offloading, all parameters are updated on the GPU by the FusedAdam optimizer. On the other hand, if offloading is enabled, all model weights are updated by CPUAdam.
+
+![cpu-offload-img](./images/cpu-offload.png)
+
+The major downside of this all-or-nothing offloading is that when all optimizer states are offloaded to the CPU, both GPU memory and compute resources remain underutilized. Although increasing the batch size improves GPU utilization, each training iteration is still very long compared with the no-offloading case. To improve GPU compute and memory utilization while decreasing training iteration time, we introduce a new feature in our DeepSpeed training engine called _Twin-Flow_.
+
+In comparison, _Twin-Flow_ allows a portion of the optimizer states to be held in CPU memory while the remaining portion stays in GPU memory. When the optimization step is triggered, the CPU and GPU update parameters simultaneously. Once offloading is enabled, we provide an offload-ratio configuration that lets users choose what percentage of the model weights is updated on the CPU side, with the rest updated on the GPU. "_Twin_" comes from the idea that both CPU and GPU run the same optimizer function; "_Flow_" means parameters are not only held in both host and device memory but also computed using both CPU and GPU cores.
+
+As shown in the figure below, with ZeRO-Offload enabled and a _Twin-Flow_ ratio of 0.4 (40%), the DeepSpeed training engine automatically assigns the optimizer step for the first 40% (i.e., 0-40%) of the weights to CPUAdam on the CPU side, while the GPU-side FusedAdam updates the remaining 60% (i.e., 40-100%) of the model parameters in parallel. Therefore, with _Twin-Flow_, we achieve decent GPU memory and core utilization while reducing training iteration time in optimizer-offloading cases.
+
+![_Twin-Flow_-img](./images/twin-offload.png)
+
+Note that the _Twin-Flow_ ratio can be adjusted based on how much idle GPU memory is available. The smaller the ratio, the more GPU memory and cores are used and the shorter the training iteration. The ideal setting gets as close as possible to the GPU memory upper bound in order to minimize training iteration time.
+Note that _Twin-Flow_ is not limited to the Adam optimizer; it can be applied to any optimizer (e.g., AdaGrad) on the user side.
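+
+For illustration, a minimal DeepSpeed config JSON sketch with the offload ratio set to 0.4 is shown below; the exact schema, in particular the `ratio` field under `offload_optimizer`, should be checked against the tutorial linked below for your DeepSpeed version:
+
+```json
+{
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true,
+      "ratio": 0.4
+    }
+  }
+}
+```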
+
+## Performance Evaluation
+
+We conducted performance evaluations on both A100 and H100 DGX machines, testing OPT models with 13B and 30B parameters. We ran OPT-13B training on an 8x A100 DGX machine and OPT-30B training on an 8x H100 DGX machine. With some tuning of the offload ratio in ZeRO-Offload++, we achieved 6x and 3x training speedups for Meta OPT models on a single DGX-H100-80GB and DGX-A100-40GB, respectively (top-most figure and bottom figure here).
+
+![a100-img](./images/a100-8.png)
+
+## Ongoing Optimizations
+
+* Reduce unnecessary D2H/H2D memcpy
+
+* On-the-fly fp16 to fp32 casting for CPUAdam
+
+## Tutorials
+
+Examples and tutorials are available [here](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/offload_pp/README.md)
+
+## Contributors:
+
+This project was made possible by the contributions of the following people from the DeepSpeed team:
+
+[Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), Masahiro Tanaka, Xiaoxia Wu, Lok Chand Koppaka, Samyam Rajbhandari, [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/), [Yuxiong He](https://www.microsoft.com/en-us/research/people/yuxhe/) (team lead)
diff --git a/blogs/deepspeed-offloadpp/images/a100-8.png b/blogs/deepspeed-offloadpp/images/a100-8.png
new file mode 100644
index 000000000000..22b787f69e1e
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/a100-8.png differ
diff --git a/blogs/deepspeed-offloadpp/images/cpu-offload.png b/blogs/deepspeed-offloadpp/images/cpu-offload.png
new file mode 100644
index 000000000000..cc4dae505cd3
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/cpu-offload.png differ
diff --git a/blogs/deepspeed-offloadpp/images/h100-8.png b/blogs/deepspeed-offloadpp/images/h100-8.png
new file mode 100644
index 000000000000..938625d52aaf
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/h100-8.png differ
diff --git a/blogs/deepspeed-offloadpp/images/twin-offload.png b/blogs/deepspeed-offloadpp/images/twin-offload.png
new file mode 100644
index 000000000000..1c8c3ef92454
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/twin-offload.png differ
diff --git a/blogs/deepspeed-triton/README.md b/blogs/deepspeed-triton/README.md
new file mode 100644
index 000000000000..071b5d4bc6d0
--- /dev/null
+++ b/blogs/deepspeed-triton/README.md
@@ -0,0 +1,95 @@
+# DeepSpeed with Triton compiler
+
+# 1. Overview
+
+We have integrated [Triton](https://github.com/openai/triton), an open-source compiler for GPU programming, into DeepSpeed, which further boosts the inference speed of BERT-like models in float16 precision.
+By replacing some CUDA kernels or torch operators with Triton kernels, we achieved a 1.14\~1.68x speedup (or 12\~41% latency reduction) across different models and GPUs, as shown in Table 1.
+
+
+
+| Hardware | Bert-base | Bert-large | Roberta-base | Roberta-large |
+|----------|:------:|:------:|:------:|:------:|
+| A100 |1.65x | 1.68x | 1.53x | 1.61x |
+| V100 | 1.29x | 1.14x | 1.23x | 1.21x |
+
+Table 1. The average speedup (see the NOTE below for more details)
+
+
+
+
+For the transformer operators in float16, we have implemented kernels written in the Triton language that replace ordinary CUDA kernels or torch operators.
+The Triton kernels we implemented include softmax, layer normalization, residual addition, and all the matrix multiplications except the MLP layers (see the NOTE below for details).
+In our experiments, the Triton kernels reduce the average latency (over different sequence lengths) by 6\~24% (depending on model and hardware) compared to the latency with CUDA-only kernels.
+
+
+The figures below show the latency reduction in more detail.
+Figure 1 visualizes the latency reduction across different sequence lengths on an A100 GPU for the Bert-base model.
+The baseline (blue) is from Hugging Face transformers without any kernel injection, the orange is from DeepSpeed with CUDA-only kernels, and the gray is from DeepSpeed with Triton kernels.
+Figure 2 shows the same plot for the Bert-large model on an A100 GPU.
+
+
+
+
+
+*Figure 1: Normalized P90 latency for the Bert-base model on an A100 GPU across different sequence lengths*
+
+
+
+*Figure 2: Normalized P90 latency for the Bert-large model on an A100 GPU across different sequence lengths*
+
+
+
+
+Next, we dive deeper into this new feature in DeepSpeed.
+
+# 2. How to use Triton in DeepSpeed
+
+You can enable the Triton kernels by setting the corresponding flags in the DeepSpeed inference config, as in the following example:
+
+```python
+import torch
+import deepspeed
+from transformers import pipeline
+
+pipe = pipeline('fill-mask', model='bert-base-cased', framework='pt', device=0)
+pipe.model = deepspeed.init_inference(pipe.model,
+                                      dtype=torch.float16,
+                                      replace_with_kernel_inject=True,
+                                      enable_cuda_graph=True,
+                                      use_triton=True,
+                                      triton_autotune=True,
+                                      max_out_tokens=pipe.tokenizer.model_max_length)
+```
+
+
+## Running BERT inference with Triton kernels
+
+We use Bert-base as an example here.
+
+```bash
+pip install deepspeed[triton]
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/inference/huggingface/fill-mask
+
+deepspeed --num_gpus 1 test-bert.py --triton
+```
+
+To run a performance benchmark, you can use the following command:
+
+```bash
+pip install deepspeed[triton]
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/benchmarks/inference
+
+deepspeed --num_gpus 1 triton-bert-benchmark.py --model bert-base-cased --dtype fp16 --kernel-inject --deepspeed --graphs --triton
+```
+
+# NOTE
+
+* For more information on how to use DeepSpeed, please visit our [GitHub Page](https://github.com/microsoft/DeepSpeedExamples) and our [website](https://www.deepspeed.ai/), where you can find blog posts, tutorials, and documentation.
+
+* This feature is currently only supported for BERT, Roberta and other BERT-like models, and not for text-generation models yet.
+
+* To achieve the best performance with Triton optimization, you need to activate CUDA graphs and `triton_autotune` in the DeepSpeed config. CUDA graphs avoid the overhead of JIT compilation and of the deep call stack in Triton. `triton_autotune` executes an initial step to find the most suitable parameters for the Triton kernels, which may take some time.
+
+* We used [Triton 2.0.0.post1 release](https://pypi.org/project/triton/2.0.0.post1/) in our experiments.
+
+* In our experiments, we used a batch size of 1, a sequence length range of 8 to 512, and a `fill-mask` task. Table 1 shows the average P90 latency over the entire sequence length range, while Figures 1 and 2 show the P90 latency for specific sub-ranges. The baseline is Hugging Face transformers without any optimization. The speedup is calculated as (baseline P90 latency)/(DeepSpeed-Triton P90 latency). We found that the CUDA kernel in the MLP performed better than the Triton kernel in our experiments, so we use a hybrid approach that combines both kernels when Triton is enabled in the DeepSpeed config.
diff --git a/blogs/deepspeed-ulysses/README.md b/blogs/deepspeed-ulysses/README.md
new file mode 100644
index 000000000000..7ea7a4535e90
--- /dev/null
+++ b/blogs/deepspeed-ulysses/README.md
@@ -0,0 +1,370 @@
+
+
+# DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed-Ulysses, please cite our [arxiv report](https://arxiv.org/abs/2309.14509):
+
+```
+@article{jacobs2023deepspeed,
+ title={DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models},
+ author={Sam Ade Jacobs and Masahiro Tanaka and Chengming Zhang and Minjia Zhang and Shuaiwen Leon Song and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14509},
+ year={2023},
+}
+```
+
+## Introduction
+
+Training large models with long sequences is becoming very important
+across the board, from generative AI to models for scientific discovery.
+On the generative AI side, conversational AI, long document summarization,
+and video generation require reasoning over long contexts in spatial and
+temporal domains. For example, multimodal foundation models, such as ones
+that process speech, images, and waveforms concurrently, require long
+context reasoning over high-dimensional inputs with extremely large
+sequences. Similarly, chapter- and book-level summarization (estimated at
+tens to hundreds of thousands of words) is of great importance in
+conversational AI and abstractive summarization tasks.
+
+Long sequence length is equally critical for AI for science, opening
+doors for a better understanding of structural biology, health care,
+climate and weather forecasting, and large molecular simulation. For
+instance, by adapting large language models to gene sequences, we can
+create language models that learn the evolutionary patterns of genomes
+from simple alphabets and extremely long sequences (the human genome
+has 6.4 billion letters). In health care, a diagnostic predictive
+model conditioned on an entire patient care record requires context of
+an extremely long sequence.
+
+Despite the emerging importance of long sequence length for both
+generative AI and AI for science, existing large model training systems
+and the underlying parallelism technologies (data, tensor, pipeline,
+sequence parallelism) are limited in their ability to support
+efficient long-sequence training. Two challenges with existing
+parallelism approaches come to the fore. First, existing parallelism
+approaches such as data, tensor, and pipeline parallelism cannot address
+scaling along the sequence dimension. Second, existing sequence
+parallelism approaches are not effective because of memory-communication
+inefficiencies. Furthermore, existing
+approaches have limited usability, requiring intrusive and error-prone
+code refactoring.
+
+In this release, we are proud to introduce *DeepSpeed-Ulysses (or
+Ulysses, a very long novel)*, a simple, portable, and effective
+methodology for enabling highly efficient and scalable LLM training with
+extremely long sequence lengths.
+
+DeepSpeed-Ulysses partitions individual samples along the sequence
+dimension among the participating GPUs. Then, right before the attention
+computation, it employs an *all-to-all communication* collective on the
+partitioned queries, keys, and values such that each GPU receives the
+full sequence but only for a non-overlapping subset of the attention
+heads. This allows the participating GPUs to compute attention for
+different attention heads in parallel. Finally, DeepSpeed-Ulysses
+employs another all-to-all to gather the results along the attention
+heads while re-partitioning along the sequence dimension.
+
+The key properties of DeepSpeed-Ulysses and its implementation released
+with this blog are as follows:
+
+* ***4x larger sequence lengths*** than existing systems, while
+enabling training with sequences of ***over a million tokens***.
+
+* Communication reduction of ***over 10x*** compared to existing
+systems, resulting in throughput improvements of ***up to 2.5x***, and
+sustained throughput of over 175 TFlops/GPU (over 54% of hardware peak).
+
+* Fully general and implementation-agnostic attention: DeepSpeed
+sequence parallelism supports dense as well as sparse
+attention, and it works with efficient attention implementations such as
+FlashAttention v2.
+
+* Support for massive model training: DeepSpeed sequence parallelism
+works together with ZeRO-3 to not only support large sequence lengths
+but also massive model sizes.
+
+* Easy-to-use and portable, requiring minimal code changes to the
+existing training frameworks.
+
+In the subsequent sections, we provide a detailed discussion of the
+DeepSpeed-Ulysses core design, a communication complexity analysis,
+an experimental evaluation and comparison with existing work, and a
+highlight of usability along with a guide on usage.
+
+## Core Design of DeepSpeed-Ulysses
+
+
+
+
+*Figure 1: DeepSpeed sequence parallelism (DeepSpeed-Ulysses) design*
+
+
+Figure 1 shows the core design of DeepSpeed-Ulysses. As in the standard
+transformer architecture, the design consists of an input sequence of
+length *N* partitioned across *P* available devices. Each local *N/P*
+partition is projected into query (Q), key (K), and value (V)
+embeddings. Next, the (QKV) embeddings are gathered into a global QKV
+through highly optimized all-to-all collectives between the
+participating compute devices. Following the all-to-all collective,
+the attention computation per head takes the form:
+
+$$Output\ context = Softmax\ (\frac{QK^{T}}{\sqrt{d}})V$$
+
+After the attention computation, another all-to-all collective
+transforms the *output context* tensor of the attention computation back
+to sequence (*N/P*) parallelism for the subsequent operators (MLP MatMul,
+layer norm, etc.) in the remaining modules of the transformer layer block.
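+
+To make the two re-partitioning steps concrete, below is a minimal
+PyTorch sketch of the first all-to-all (sequence-parallel shards to
+head-parallel shards). The function name and tensor layout are
+illustrative assumptions, not DeepSpeed's actual implementation, which
+lives in `deepspeed.sequence.layer`:
+
+```python
+import torch
+import torch.distributed as dist
+
+def seq_to_head_all_to_all(x: torch.Tensor, group) -> torch.Tensor:
+    # x: [N/P, H, d] local sequence shard holding all H attention heads.
+    # Returns: [N, H/P, d] the full sequence for a local subset of heads.
+    P = dist.get_world_size(group=group)
+    n_local, H, d = x.shape
+    # Chunk the head dimension into P groups; chunk p is sent to rank p.
+    x = x.reshape(n_local, P, H // P, d).permute(1, 0, 2, 3).contiguous()
+    out = torch.empty_like(x)
+    dist.all_to_all_single(out, x, group=group)
+    # out[p] is rank p's sequence shard for our local heads; stacking the
+    # shards reconstructs the full sequence of length N = P * n_local.
+    return out.reshape(P * n_local, H // P, d)
+```
+
+The second all-to-all after attention is the same operation with the
+roles of the sequence and head dimensions swapped.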
+
+### Significant Communication Volume Reduction
+
+What distinguishes DeepSpeed-Ulysses from other existing
+long-sequence approaches is our much smaller aggregate communication
+volume and overall better scalability with an increasing degree of
+sequence parallelism, as demonstrated by the
+communication volume analysis below:
+
+On modern clusters with intra-node NVSwitch interconnect and inter-node
+fat-tree IB topology, the communication volume transmitted per link for
+an all-to-all of an aggregate message of size *M* over *P* GPUs is *M/P*.
+For a transformer model with hidden size h, sequence length N, and
+parallelism degree P, DeepSpeed sequence parallelism performs an
+all-to-all for the QKV projections with an aggregate message size of
+*3Nh* before the attention computation, and another all-to-all for the
+output context projection with a size of *Nh*, for each transformer
+layer. Therefore, DeepSpeed sequence parallelism incurs an aggregate
+communication volume per link of ***4Nh/P (i.e., a complexity of
+O(N/P))***. Note that this communication volume is constant when both
+N and P are increased proportionally.
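+
+As a back-of-the-envelope illustration (the numbers below are chosen
+only for the arithmetic, not drawn from the evaluation): with h = 8192,
+N = 1M tokens, and P = 64, the per-link volume 4Nh/P is roughly 0.5G
+elements, or about 1 GiB per layer in fp16, whereas a per-link volume
+of 4Nh would be about 64 GiB, i.e., P times larger.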
+
+In contrast, existing approaches like Megatron-LM incur a
+communication volume that increases linearly with N regardless of P,
+resulting in a ***communication complexity of O(N)***. For instance,
+Megatron-LM performs two *all-gather* operations with a message volume
+of *Nh* and two *reduce-scatter* operations with a volume of *Nh* for
+each transformer layer. However, the cost of each all-gather and
+reduce-scatter of size M remains M when *P \>\> 1*, instead of *M/P*.
+Therefore, Megatron-LM sequence parallelism incurs a communication
+volume per link of ***4Nh***, which is P times larger than that of
+DeepSpeed sequence parallelism. This allows DeepSpeed sequence
+parallelism to enable training with extremely long sequences while
+achieving significantly higher training efficiency compared to
+existing approaches. Our evaluation results match this analysis.
+
+### Additional Highlights of DeepSpeed-Ulysses
+
+***An Attention Agnostic Solution***
+
+The DeepSpeed implementation of the distributed attention module is
+general enough to support any attention: e.g., self-attention,
+cross-attention, and causal attention, in both their dense and sparse
+counterparts, as well as their various optimized kernels that support
+long sequences at the local attention level, such as different versions
+of FlashAttention.
+
+The generality of DeepSpeed-Ulysses stems from the modular nature of
+its core design: an attention-centric sequence parallelism design.
+Before the attention computation, sequence parallelism operates on an
+N/P partition of the sequence; the attention computation itself is head
+parallelism with full attention per head, just with fewer heads per
+device. The attention computation can therefore be replaced with any
+type of attention mechanism, e.g., dense attention or various forms of
+sparse attention.
+
+***Training Bigger Models with Longer Sequences through ZeRO-3 Integration***
+
+While DeepSpeed sequence parallelism reduces the activation memory when
+training with longer sequences, it does not impact the memory consumed
+by the model states. Therefore, to support large sequence length
+training with large language models, DeepSpeed sequence parallelism is
+integrated with ZeRO-3.
+
+[ZeRO Redundancy Optimizer Stage 3 (ZeRO-3)](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) is a memory optimization technique for training large
+models. Unlike the classic data parallel training of neural networks
+where model states are replicated across data parallel ranks, ZeRO-3
+optimizes memory usage by partitioning model states across data parallel
+ranks. However, with sequence parallelism, the training data can be
+considered along both the batch (sample) and sequence dimensions, and
+the associated parallel groups can be combined to form a larger group
+for ZeRO parallelism.
+
+Therefore, we extend ZeRO-3 partitioning to the combination of data
+parallel and sequence parallel ranks. In other words, in DeepSpeed
+sequence parallelism, ZeRO partitions model states across both the
+sequence and data parallel groups and collects the per-rank partitions
+(via allgather) when they are needed. Similarly, gradients are reduced
+across both data and sequence parallel ranks for the parameter update.
+ZeRO allows for huge memory savings in both the sequence and data
+dimensions and enables scaling not just to large sequence lengths but
+also to large models.
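+
+From a user's perspective, enabling this combination is largely a
+matter of turning on ZeRO-3 in the DeepSpeed config while the training
+script sets the sequence parallelism degree. A minimal config sketch is
+shown below (the command-line flag named afterwards is
+Megatron-DeepSpeed's, stated here as an assumption for illustration):
+
+```json
+{
+  "zero_optimization": { "stage": 3 },
+  "train_micro_batch_size_per_gpu": 1,
+  "bf16": { "enabled": true }
+}
+```
+
+combined with, e.g., `--ds-sequence-parallel-size 4` on the
+Megatron-DeepSpeed command line.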
+
+## Evaluation
+
+We evaluate DeepSpeed-Ulysses (Ulysses) on GPT,
+a foundation model for many NLP tasks, on up to 64 A100 GPUs with 40GB
+memory. Our evaluation is four-fold: i) sequence length scalability,
+ii) throughput for dense attention and comparison with existing systems,
+iii) throughput for sparse attention and comparison with existing
+systems, and iv) a convergence study of DeepSpeed sequence parallelism.
+We discuss and present the evaluations from each of these categories next.
+
+### Sequence Length Scalability
+
+The first set of experiments is strong scaling of sequence length up to
+1 million tokens on a 1.2 billion parameter GPT model. The results of
+this evaluation are shown in Figure 2. DeepSpeed sequence parallelism
+allows increasing the sequence length linearly with the number of GPUs
+while maintaining similar computation throughput across different
+sequence lengths at the corresponding GPU counts.
+
+
+
+
+*Figure 2: DeepSpeed sequence parallelism strong scalability evaluation
+at different sequence lengths and GPU counts.*
+
+
+### Dense Attention Evaluation
+
+Next, we evaluate Ulysses on 7 billion (7B) and 30 billion (30B) parameter
+GPT dense attention models and compare against Megatron-LM's sequence
+parallelism (Megatron LM) and Colossal-AI sequence parallelism (ColAI-SP) on
+32 and 64 A100 GPUs, respectively. The results of these evaluations are
+shown in Figures 3 and 4.
+
+We compare Ulysses with Megatron-LM and ColAI-SP for the 7B and 30B models
+running various sequence lengths. We chose the sequence parallelism
+degree and micro-batch size that produced the best performance
+(measured in TFLOPs) for each of the three methods; we call these the
+optimal (batch size, sequence length) configurations. For Ulysses, we
+always use a ZeRO-3 parallelism degree of 32 and 64 for the 7B and 30B
+models, respectively.
+
+
+Figures 3 and 4 show that Ulysses consistently outperforms Megatron-LM
+and ColAI-SP at the sequence lengths they can run. In addition,
+Ulysses can run longer sequences than the two existing methods. Ulysses'
+performance advantages are twofold: (1) Ulysses, in combination with
+ZeRO-3 parameter sharding across both data and sequence parallel groups,
+fits more samples than Megatron-LM and ColAI-SP because of the memory
+optimization, leading to higher throughput; (2) Ulysses benefits from
+efficient *all-to-all* communication relative to the *all-gather*,
+*reduce-scatter*, and *ring-style* P2P communication applied in
+Megatron-LM and ColAI-SP sequence parallelism.
+However, for dense attention at long sequence lengths, throughput is
+primarily determined by the local attention computation due to the
+quadratic computation complexity of attention; therefore, the performance
+gap between Ulysses and the two existing methods closes at the sequence
+lengths they can all run.
+
+
+
+
+*Figure 3: Evaluation of Ulysses vs. Megatron LM vs. ColAI-SP on a GPT-7B
+ model with dense attention (32 GPUs).*
+
+
+
+
+
+*Figure 4: Evaluation of Ulysses vs. Megatron LM vs. ColAI-SP on a GPT-30B
+ model with dense attention (64 GPUs).*
+
+
+### Sparse Attention Evaluation
+
+Similarly, we evaluate Ulysses on 7 billion and 30 billion parameter sparse
+attention models and benchmark against Megatron-LM sequence parallelism.
+There is no public implementation of block sparse attention for ColAI-SP;
+therefore, the sparse attention evaluation is in comparison with Megatron-LM
+only. The results of our evaluation are shown in Figures 5 and 6. We observe
+similar trends with sparse attention as in the dense attention experiments.
+We observe more than 2x higher throughput for Ulysses compared to
+Megatron-LM. With its memory savings, Ulysses leveraging ZeRO-3 scales to
+4x longer sequence lengths than Megatron-LM.
+
+Ulysses outperforms Megatron-LM at the sequence lengths both can run.
+In fact, the current Ulysses throughput is bottlenecked by the local sparse
+attention implementation, and as a result Ulysses' throughput decreases as
+the sequence length increases. We expect the performance gap between our
+method and Megatron-LM to increase further at larger sequence lengths as we
+improve the performance of the local sparse attention implementation in the
+future. A noteworthy observation is that the narrowing performance gap
+between Ulysses and Megatron-LM observed in the dense attention evaluation
+is less pronounced in the sparse attention evaluation, because the attention
+computation is less dominant in sparse attention than in dense attention.
+
+
+
+
+*Figure 5: Evaluation of Ulysses and Megatron LM sequence parallelism on a
+GPT-7B model with block sparse attention (32 GPUs).*
+
+
+
+
+
+*Figure 6: Evaluation of Ulysses and Megatron LM sequence parallelism on a
+GPT-30B model with block sparse attention (64 GPUs).*
+
+
+### Convergence Study
+
+Lastly, Figure 7 shows the convergence of a 1.3 billion parameter GPT
+model at 32K sequence length on 8 A100 GPUs, with the sequence
+parallelism degree set to 4 for both DeepSpeed and Megatron-LM sequence
+parallelism. For DeepSpeed sequence parallelism, we evaluate convergence
+with different ZeRO stages. DeepSpeed sequence parallelism is a purely
+system-level optimization technique that enables training of long-sequence
+transformer models; thus, there is no (negative) impact on the quality of
+the trained models. This assertion is validated through the experiments
+shown in Figure 7.
+
+
+
+
+*Figure 7: Convergence evaluation of DeepSpeed sequence parallelism with different
+ZeRO memory optimization stages.*
+
+
+## DeepSpeed-Ulysses Software Accessibility
+
+DeepSpeed-Ulysses can be easily integrated into your code with just a
+few lines of simple code changes. Here is an example of how to enable
+it:
+
+```python
+from deepspeed.sequence.layer import DistributedAttention
+
+# Replace the original self-attention (attn) with DeepSpeed-Ulysses’s self-attention
+
+dist_attn = DistributedAttention(attn, get_sequence_parallel_group())
+```
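+
+Here `attn` is the original local attention module that
+DistributedAttention wraps, and `get_sequence_parallel_group()` stands
+in for however your training framework exposes the sequence parallel
+process group; the helper name is illustrative rather than part of the
+DeepSpeed API.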
+
+Compared to other libraries that support sequence parallelism, such as
+Megatron-LM, DeepSpeed-Ulysses does not require model refactoring.
+DeepSpeed-Ulysses has been fully integrated and tested with the
+Megatron-DeepSpeed code repository. This means that if you are already
+using this repository for training large language models, you can
+seamlessly benefit from DeepSpeed-Ulysses to train models with massive
+sequence length.
+
+## Release: Try DeepSpeed-Ulysses Today
+
+We are excited to release DeepSpeed-Ulysses, accessible through the
+DeepSpeed GitHub. A detailed tutorial on usage is available on the
+[DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/ds-sequence/).
+
+We welcome contributions and collaboration as we together push forward
+on what is possible when long context windows are no longer a limitation.
+DeepSpeed-Ulysses is part of the bigger DeepSpeed ecosystem of
+large-scale AI training and inference. For more details on all DeepSpeed
+technologies and innovations, please visit our [website](https://www.deepspeed.ai/) and follow us
+on X, formerly Twitter ([English](https://twitter.com/MSFTDeepSpeed), [Japanese](https://twitter.com/MSFTDeepSpeedJP)), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed).
+
+We are open to collaborations with universities, research labs, and
+companies. For such requests (and other requests unsuitable for GitHub),
+please directly email deepspeed-info@microsoft.com. If you like
+our work, please "Star" our [repo](https://github.com/microsoft/DeepSpeed).
diff --git a/blogs/deepspeed-ulysses/chinese/README.md b/blogs/deepspeed-ulysses/chinese/README.md
new file mode 100644
index 000000000000..20af8b91fbea
--- /dev/null
+++ b/blogs/deepspeed-ulysses/chinese/README.md
@@ -0,0 +1,155 @@
+
+
+# DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models
+
+
+
+
+
+
+
+
+
+## Introduction
+
+Training with long sequences is becoming very important, from generative AI to models for scientific research.
+In generative AI, tasks such as conversational AI, long document summarization, and video generation all require reasoning over long contexts in the spatial and temporal domains.
+For example, multimodal foundation models, such as those that process speech, images, and waveforms concurrently, require long-context reasoning over high-dimensional inputs with extremely long sequences.
+Likewise, chapter- and book-level summarization (tens or even hundreds of thousands of words) is very important in conversational AI and summarization tasks.
+
+Long sequences are equally critical for AI for science, opening doors to a better understanding of structural biology, health care, climate and weather forecasting, and large molecular simulation.
+For example, by training large language models on gene sequences, we can create language models that learn the evolutionary patterns of genomes from extremely long sequences (the human genome has 6.4 billion base pairs). In health care, diagnostic predictive models conditioned on entire patient care records require context over extremely long sequences.
+
+Although the importance of long sequence lengths keeps growing for both generative AI and AI for science, existing large model training systems and the underlying parallelism technologies (data, tensor, pipeline, and sequence parallelism) cannot support efficient long-sequence training. Existing parallelism methods face two main challenges. First, existing methods such as data, tensor, and pipeline parallelism cannot address scaling along the sequence dimension. Second, existing sequence parallelism methods are not efficient enough due to memory-communication inefficiencies. In addition, existing methods lack ease of use, requiring intrusive, complex, and error-prone code refactoring.
+
+To solve these problems, we are delighted to introduce *DeepSpeed-Ulysses (or Ulysses, a very long novel)*, a simple, easy-to-use, and efficient methodology for enabling highly efficient and scalable LLM training with extremely long sequence lengths.
+
+DeepSpeed-Ulysses partitions each sample along the sequence dimension across the participating GPUs. Then, before the attention computation, it performs an *all-to-all communication* operation on the partitioned queries (Q), keys (K), and values (V) so that each GPU receives the full sequence, but only for a non-overlapping subset of the attention heads. This allows the participating GPUs to compute different attention heads in parallel. Finally, DeepSpeed-Ulysses uses another all-to-all to gather the results along the attention heads while re-partitioning along the sequence dimension.
+
+The key properties of DeepSpeed-Ulysses and of the implementation released with this blog are as follows:
+
+* ***4x larger sequence lengths*** than existing systems, supporting training with sequences of ***over a million tokens***.
+
+* Communication reduction of ***over 10x*** compared to existing systems, yielding throughput improvements of ***up to 2.5x*** and sustained per-GPU throughput of over 175 TFlops (over 54% of the hardware peak).
+
+* Fully general attention: DeepSpeed sequence parallelism supports both dense and sparse attention, and works with efficient attention implementations such as FlashAttention v2.
+
+* Support for large-scale model training: DeepSpeed sequence parallelism not only supports long sequence lengths but can also be combined with ZeRO-3 to support large model sizes.
+
+* Easy to use and portable, requiring minimal code changes to existing training frameworks.
+
+In the following sections, we discuss in detail the core design of DeepSpeed-Ulysses, its communication complexity analysis, the experimental evaluation and comparison with existing work, and present its usability and a usage guide.
+
+## Core Design of DeepSpeed-Ulysses
+
+
+
+
+*Figure 1: DeepSpeed sequence parallelism (DeepSpeed-Ulysses) design*
+
+
+Figure 1 shows the core design of DeepSpeed-Ulysses. As in the standard transformer architecture, the design consists of an input sequence of length *N* partitioned across *P* available devices. Each local *N/P* partition is projected into query (Q), key (K), and value (V) embeddings. Next, the (QKV) embeddings are gathered into a global QKV through highly optimized all-to-all collectives between the participating compute devices. After the all-to-all collective, the per-head attention computation takes the form:
+
+$$Output\ context = Softmax\ (\frac{QK^{T}}{\sqrt{d}})V$$
+
+After the attention computation, another all-to-all collective transforms the output context tensor of the attention computation back to sequence (*N/P*) parallelism for the subsequent operators (MLP MatMul, layer norm, etc.) in the remaining modules of the transformer layer.
+
+### Significant Communication Volume Reduction
+
+What distinguishes DeepSpeed-Ulysses from other existing long-sequence methods is its smaller aggregate communication volume and its better scalability as the degree of sequence parallelism increases, as shown below:
+
+On modern clusters with intra-node NVSwitch interconnect and inter-node fat-tree IB topology, for an all-to-all with an aggregate message of size *M* across *P* GPUs, the communication volume per link is *M/P*.
+For a transformer model with hidden size h, sequence length N, and parallelism degree P, DeepSpeed sequence parallelism performs an all-to-all with an aggregate message size of *3Nh* for the QKV projections before the attention computation, and another all-to-all of size *Nh* for the output context projection after the attention computation. Therefore, the aggregate communication volume per link for DeepSpeed sequence parallelism is ***4Nh/P (or O(N/P) in complexity)***. Notably, this communication volume is constant when N and P are increased proportionally.
+
+In contrast, existing methods such as Megatron-LM incur communication volume that grows linearly with N regardless of P, resulting in a ***communication complexity of O(N)***. For example, Megatron-LM performs two *all-gather* operations of size *Nh* and two *reduce-scatter* operations of size *Nh* for each transformer layer. However, when *P \>\> 1*, the cost of each all-gather and reduce-scatter of size M remains M rather than *M/P*. Therefore, Megatron-LM sequence parallelism incurs a communication volume per link of ***4Nh***, which is P times larger than that of DeepSpeed sequence parallelism. This allows DeepSpeed sequence parallelism to support extremely long sequence training while achieving significantly higher training efficiency. Our experimental results match this theoretical analysis.
+
+### Additional Highlights of DeepSpeed-Ulysses
+
+***A General Attention Solution***
+
+The DeepSpeed distributed attention module is implemented generally enough to support any type of attention, such as self-attention, cross-attention, and causal attention, in both their dense and sparse versions, as well as the various optimized kernels that support long sequences at the local attention level, such as different versions of FlashAttention.
+
+The generality of DeepSpeed-Ulysses comes from the modular nature of its core design: an attention-centric sequence parallelism design. Before the attention computation, sequence parallelism partitions over N/P; the attention computation itself is head parallelism, keeping full attention per head but with fewer heads per device. The attention computation can therefore be replaced with any type of attention mechanism, such as dense attention or various forms of sparse attention.
+
+***Training Bigger Models with Longer Sequences through ZeRO-3 Integration***
+
+Although DeepSpeed sequence parallelism reduces activation memory when training with longer sequences, it does not affect the memory footprint of the model states. Therefore, to support long sequence length training of large language models, we integrated DeepSpeed sequence parallelism with ZeRO-3.
+
+[ZeRO Redundancy Optimizer Stage 3 (ZeRO-3)](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) is a memory optimization technique for training large models. Unlike classic data-parallel training of neural networks, where model states are replicated across data parallel ranks, ZeRO-3 optimizes memory usage by partitioning model states across data parallel ranks. However, with sequence parallelism, the training data can be considered along both the batch (sample) and sequence dimensions, and the associated parallel groups can be combined into a larger group for ZeRO parallelism.
+
+Therefore, we extend ZeRO-3 partitioning to the combination of data parallel and sequence parallel ranks. In other words, in DeepSpeed sequence parallelism, ZeRO partitions the model states across both the sequence and data parallel groups and collects the per-rank partitions (allgather) when needed. Similarly, gradients are reduced across both the data and sequence parallel ranks for the parameter update. ZeRO achieves huge memory savings in both the sequence and data dimensions, scaling not only to long sequence lengths but also to large models.
+
+## Evaluation
+
+We evaluated DeepSpeed-Ulysses on GPT (a foundation model for many NLP tasks) using up to 64 A100 GPUs (40GB memory). Our evaluation is four-fold: i) sequence length scalability, ii) throughput for dense attention and comparison with existing systems, iii) throughput for sparse attention and comparison with existing systems, and iv) a convergence study of DeepSpeed sequence parallelism. We discuss and present the evaluation results for each category next.
+
+### Sequence Length Scalability
+
+The first set of experiments scales the sequence length up to 1 million tokens on a 1.2 billion parameter GPT model. The results are shown in Figure 2. DeepSpeed sequence parallelism allows the sequence length to increase linearly with the number of GPUs, maintaining similar computation throughput across different sequence lengths at the corresponding GPU counts.
+
+
+
+
+*Figure 2: DeepSpeed sequence parallelism strong scalability evaluation with different sequence lengths and GPU counts.*
+
+
+### Dense Attention Evaluation
+
+Next, we evaluated DeepSpeed sequence parallelism on a 30 billion parameter dense attention model and compared it against Megatron sequence parallelism on 64 A100 GPUs. The results of these evaluations are shown in Figure 3.
+
+We compared the performance of DeepSpeed sequence parallelism and Megatron-LM across different sequence lengths. For our evaluation, we chose the sequence length and batch size combinations that give DeepSpeed sequence parallelism and Megatron-LM their respective best performance (measured in throughput or TFLOPs), which we call the optimal (batch size, sequence length) configurations. For DeepSpeed sequence parallelism, we always used a ZeRO parallelism degree of 64.
+
+Figure 3 shows that DeepSpeed sequence parallelism consistently outperforms Megatron-LM at the same sequence lengths. In addition, DeepSpeed sequence parallelism can run longer sequences than Megatron-LM. Its performance advantages are twofold: (1) combined with the memory optimization of ZeRO-3, DeepSpeed sequence parallelism can fit more samples, improving throughput; (2) DeepSpeed sequence parallelism benefits from more efficient all-to-all communication relative to the *all-gather* communication applied in Megatron-LM sequence parallelism.
+
+
+
+
+*Figure 3: Dense attention evaluation of DeepSpeed and Megatron LM sequence parallelism on a 30 billion parameter model.*
+
+
+### Sparse Attention Evaluation
+
+Similarly, we evaluated DeepSpeed sequence parallelism on a 30 billion parameter sparse attention model and benchmarked it against Megatron sequence parallelism. The results are shown in Figure 4. The sparse attention experiments show trends similar to the dense attention experiments. We observe more than 2x higher throughput for DeepSpeed sequence parallelism compared with Megatron-LM. Through its memory savings, DeepSpeed sequence parallelism combined with ZeRO-3 scales to sequence lengths 4x longer than Megatron-LM.
+
+DeepSpeed sequence parallelism consistently outperforms Megatron-LM at the same sequence lengths. In fact, the current DeepSpeed throughput is bottlenecked by the local sparse attention implementation, so DeepSpeed's throughput decreases as the sequence length increases. We expect the performance gap between DeepSpeed and Megatron to widen further at larger sequence lengths as the performance of the local sparse attention implementation improves in the future.
+
+
+
+
+*Figure 4: Sparse attention evaluation of DeepSpeed and Megatron LM sequence parallelism on a 30 billion parameter model.*
+
+
+### Convergence Study
+
+Finally, Figure 5 shows the convergence of a 1.3 billion parameter GPT model at a 32K sequence length on 8 A100 GPUs, with the sequence parallelism degree set to 4. For DeepSpeed sequence parallelism, we evaluated convergence with different ZeRO stages. DeepSpeed sequence parallelism is a purely system-level optimization technique that enables the training of long-sequence transformer models, so there is no (negative) impact on the quality of the trained models; this is validated experimentally and shown in Figure 5.
+
+
+
+
+*Figure 5: Convergence evaluation of DeepSpeed sequence parallelism with different ZeRO memory optimization stages.*
+
+
+## DeepSpeed-Ulysses Software Accessibility
+
+DeepSpeed-Ulysses can be integrated into your code with just a few simple code changes. Here is an example of how to enable it:
+
+```python
+from deepspeed.sequence.layer import DistributedAttention
+
+# Replace the original self-attention (attn) with DeepSpeed-Ulysses's self-attention
+
+dist_attn = DistributedAttention(attn, get_sequence_parallel_group())
+```
+
+Compared with other libraries that support sequence parallelism, such as Megatron-LM, DeepSpeed-Ulysses does not require model refactoring.
+DeepSpeed-Ulysses has been fully integrated and tested with the Megatron-DeepSpeed code repository. This means that if you are already
+using this repository to train large language models, you can seamlessly use DeepSpeed-Ulysses to train models with extremely long sequences.
+
+## Try DeepSpeed-Ulysses Today
+
+We are excited to announce the release of DeepSpeed-Ulysses; the code is available through DeepSpeed GitHub. A detailed usage tutorial is provided on the [DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/ds-sequence/).
+
+We welcome contributions and collaboration of all kinds as we together push forward on the innovations that become possible when long context windows are no longer a limitation. DeepSpeed-Ulysses is part of the bigger DeepSpeed ecosystem for large-scale AI training and inference. For more details on all DeepSpeed technologies and innovations, please visit our [website](https://www.deepspeed.ai/) and follow us on X (Twitter) ([English](https://twitter.com/MSFTDeepSpeed), [Japanese](https://twitter.com/MSFTDeepSpeedJP)) and on [Chinese Zhihu](https://www.zhihu.com/people/deepspeed).
+
+We are open to collaborations with universities, research labs, and companies. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
diff --git a/blogs/deepspeed-ulysses/japanese/README.md b/blogs/deepspeed-ulysses/japanese/README.md
new file mode 100644
index 000000000000..88a0e375ce70
--- /dev/null
+++ b/blogs/deepspeed-ulysses/japanese/README.md
@@ -0,0 +1,158 @@
+
+
+# DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models
+
+
+
+
+
+
+
+
+
+## Overview
+
+Training huge models with long sequences is becoming very important across the board, from generative AI to models for scientific discovery.
+In generative AI, tasks such as conversational AI, long-form summarization, and video generation require understanding long contexts in the spatial and temporal domains.
+For example, multimodal foundation models that process speech, images, and waveforms concurrently must understand long contexts from high-dimensional inputs with extremely long sequence lengths. Similarly, chapter- and book-level summarization (estimated at tens to hundreds of thousands of words) is very important in conversational AI and summarization tasks.
+
+The ability to handle long sequences is also important for the use of AI in science, with the potential to advance structural biology, health care, climate and weather forecasting, and large-scale molecular simulation. For example, by adapting large language models to gene sequences, we can create language models that learn the evolutionary patterns of genomes from extremely long sequences over a simple alphabet (the human genome has 6.4 billion letters). In the medical field, diagnostic prediction models conditioned on entire patient care records must handle context expressed as extremely long sequences.
+
+While the importance of handling long sequences is rapidly growing in generative AI and science, existing training systems for large models and the underlying parallelization technologies (data, tensor, pipeline, and sequence parallelism) have not been able to train long sequences efficiently. Existing parallelization approaches face two challenges. First, existing widely used parallel approaches such as data, tensor, and pipeline parallelism cannot scale along the sequence dimension. Second, existing sequence parallelism approaches do not achieve high efficiency because of the communication of data in memory. In addition, existing approaches require large-scale code changes that easily introduce errors into existing code.
+
+This release introduces *DeepSpeed-Ulysses (or Ulysses, named after a very long novel)*, a new methodology that enables the processing of extremely long sequences efficiently and scalably in the training of LLMs (large language models).
+
+
+DeepSpeed-Ulysses partitions individual samples along the sequence dimension across multiple GPUs. Then, immediately before the attention computation in the transformer, it applies *all-to-all* communication to the queries (Q), keys (K), and values (V).
+Through this all-to-all communication, the data is partitioned across the GPUs in units of attention heads without overlap, while each GPU holds the entire sequence. Each GPU then computes different attention heads, so the computation proceeds in parallel. After the attention computation, another all-to-all communication re-partitions the results along the sequence dimension.
+
+The main features of DeepSpeed-Ulysses and the implementation introduced in this blog are as follows.
+
+
+* Training with ***4x longer sequence lengths*** than existing systems (***over 1 million tokens***).
+
+* ***Over 10x communication reduction*** compared to existing systems, enabling ***up to 2.5x higher throughput*** and a sustained throughput of 175 TFlops/GPU (over 54% of the hardware peak).
+
+* Generality independent of the attention implementation: not only dense attention algorithms but also sparse algorithms can be used, and efficient attention implementations such as FlashAttention v2 are easy to adopt.
+
+* Support for training large models: in conjunction with ZeRO-3, it supports not only long sequences but also huge model sizes.
+
+* Applicable to existing training frameworks with minimal code changes.
+
+In the following sections, we describe the central design ideas of DeepSpeed-Ulysses in detail, analyze its communication cost, present an experimental evaluation and comparison with existing methods, and then explain how to use it.
+
+
+## Design of DeepSpeed-Ulysses
+
+
+
+
+*Figure 1: Design of DeepSpeed-Ulysses*
+
+
+Figure 1 shows the central design of DeepSpeed-Ulysses. As in the standard transformer architecture, an input sequence of length *N* is partitioned across *P* available devices. From the partitioned sequence of size *N/P* on each device, the queries (Q), keys (K), and values (V) are computed. Next, a global QKV is formed from the local QKV on each device through all-to-all collective communication. Following the all-to-all communication, attention is computed per head as follows:
+
+$$Output\ context = Softmax\ (\frac{QK^{T}}{\sqrt{d}})V$$
+
+After the attention computation, all-to-all communication is performed again, and the output is partitioned along the sequence dimension for the subsequent operators of the transformer layer (MLP, layer norm, etc.); the partitioned sequence length on each device becomes *N/P* again.
+
+### Significant Reduction in Communication Volume
+
+What distinguishes DeepSpeed-Ulysses from existing parallelization methods for long sequences is, as the following communication analysis shows, its much smaller total communication volume and the resulting superior overall scalability as the degree of sequence parallelism increases.
+
+On modern compute clusters equipped with NVSwitch for intra-node communication and a fat-tree IB topology for inter-node communication, when an all-to-all is performed across *P* GPUs with an aggregate message size of *M*, the communication volume per link is *M/P*. For a transformer model with hidden size *h*, sequence length *N*, and parallelism degree *P*, an all-to-all is performed on the QKV before the attention computation with an aggregate message size of *3Nh*, and another all-to-all is performed on the attention output with a message size of *Nh*. Therefore, the total communication volume per link per transformer layer is ***4Nh/P*** (O(N/P) in order terms). This communication volume is constant when both N and P are increased proportionally.
+
+In contrast, existing approaches such as Megatron-LM's sequence parallelism incur communication that grows linearly with *N* regardless of *P*, so the order of the communication volume is ***O(N)***. For example, Megatron-LM performs, for each transformer layer, two all-gathers with communication volume Nh and two reduce-scatters with communication volume Nh. However, the cost of each all-gather and reduce-scatter of size M remains M (rather than M/P) when *P \>\> 1*. Therefore, Megatron-LM sequence parallelism incurs a communication volume of ***4Nh***, P times larger than that of DeepSpeed-Ulysses. This allows DeepSpeed-Ulysses to train with extremely long sequences while greatly improving training efficiency compared to existing approaches. The evaluation results below are consistent with this analysis.
+
+### Other Features
+
+***Independence from the Attention Implementation***
+
+DeepSpeed-Ulysses has a generalized structure that can be combined with any attention implementation: different algorithms such as self-attention, cross-attention, and dense/sparse attention, as well as implementations using various optimized kernels that support long sequences, such as FlashAttention.
+
+This generality is achieved by using the attention computation as a module. Before the attention computation, the sequence length N is partitioned into N/P, while the attention computation itself computes full attention per head, just with fewer heads per device. Therefore, the attention computation can be replaced with any kind of attention mechanism, such as dense algorithms or algorithms for various kinds of sparse attention.
+
+***Training Large Models with ZeRO-3***
+
+While the sequence partitioning and parallelization of DeepSpeed-Ulysses reduce activation memory when training with long sequences, they do not affect the amount of memory required to hold the model states. Therefore, to support training with long sequence lengths on large language models, sequence parallelism is integrated with ZeRO-3.
+
+
+[ZeRO Redundancy Optimizer Stage 3 (ZeRO-3)](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) is a memory optimization technique for training large models. Unlike conventional data parallelism, which replicates the model states (parameters, gradients, and optimizer states) on all GPUs, ZeRO-3 partitions the model states across the GPUs. When sequence parallelism is used together, the training data is partitioned along both the sample dimension and the sequence dimension.
+We therefore perform ZeRO-3's partitioning of parameters, gradients, and other states across the group of processes spanning both data parallelism and sequence parallelism, and collect them via allgather communication when needed. Similarly, gradient aggregation (reduce) for the parameter update is carried out across the processes spanning both data and sequence parallelism. Using ZeRO enables large memory savings in both the sequence and data dimensions, scaling not only to long sequence lengths but also to large model sizes.
+
+## Evaluation
+
+We applied DeepSpeed-Ulysses to the training of GPT models, which are used as foundation models for many NLP tasks, and evaluated it using up to 64 A100 GPUs (40GB memory). The evaluation was carried out from four perspectives: i) sequence length scalability, ii) throughput with dense attention and comparison with existing systems, iii) throughput with sparse attention and comparison with existing systems, and iv) verification of convergence. The results for each are shown below.
+
+### Sequence Length Scalability
+
+
+The first evaluation experiment is strong scaling of the sequence length up to 1 million tokens on a 1.2 billion parameter GPT model. The results of this evaluation are shown in Figure 2. When the sequence length is increased in proportion to the number of GPUs, nearly the same computation throughput is maintained at each GPU count and sequence length.
+
+
+
+
+*Figure 2: Strong scaling at different sequence lengths and GPU counts*
+
+
+### Comparison with Dense Attention
+
+Next, Figure 3 shows benchmark results on a 30 billion parameter model with dense attention, compared against Megatron-LM's sequence parallelism on 64 A100 GPUs.
+
+Here, we compared DeepSpeed-Ulysses and Megatron-LM sequence parallelism at various sequence lengths. For the evaluation, we selected the sequence parallelism degree and global batch size that give each framework its best performance (measured in throughput or TFLOPs); we call these the optimal (batch size, sequence length) configurations. For DeepSpeed-Ulysses, we always used ZeRO-3, partitioning the parameters, gradients, and optimizer states across the 64 GPUs.
+
+As shown in Figure 3, DeepSpeed-Ulysses consistently outperformed Megatron-LM at the sequence lengths that both can process. Furthermore, DeepSpeed-Ulysses can process longer sequences than Megatron-LM's sequence parallelism. The advantages of DeepSpeed-Ulysses are twofold: (1) in combination with ZeRO-3, it needs less memory, so it can process larger batch sizes than Megatron-LM, increasing throughput; (2) DeepSpeed-Ulysses benefits from more efficient all-to-all communication compared with the all-gather communication applied in Megatron-LM sequence parallelism.
+
+
+
+
+
+*Figure 3: Comparison with Megatron-LM on a 30 billion parameter model with dense attention*
+
+
+### Comparison with Sparse Attention
+
+Similarly, we applied DeepSpeed-Ulysses to a 30 billion parameter model using sparse attention and compared it with Megatron-LM's sequence parallelism. The results of the evaluation are shown in Figure 4. For sparse attention, we see trends similar to dense attention. We confirmed that the throughput of DeepSpeed-Ulysses is more than 2x that of Megatron-LM. Through the reduction in memory usage with ZeRO-3, it can process sequence lengths 4x longer than Megatron-LM.
+
+DeepSpeed-Ulysses outperforms Megatron-LM at the sequence lengths that both can run. In fact, the current throughput of DeepSpeed-Ulysses is bottlenecked by the sparse attention computed locally on each GPU, and as a result the throughput decreases as the sequence length increases. We expect that improving the performance of the local sparse attention implementation in the future will further widen the performance gap between DeepSpeed-Ulysses and Megatron at larger sequence lengths.
+
+
+
+
+*Figure 4: Comparison with Megatron-LM on a 30 billion parameter model with sparse attention*
+
+
+### Convergence Verification
+
+Figure 5 shows the convergence of a 1.3 billion parameter GPT model with a 32K sequence length on 8 A100 GPUs. Here, the sequence parallelism degree is set to 4 for both DeepSpeed-Ulysses and Megatron-LM sequence parallelism. Since DeepSpeed-Ulysses can be used together with ZeRO, we evaluated convergence at each ZeRO stage. DeepSpeed-Ulysses is a system-level optimization technique for enabling the training of long-sequence transformer models, and therefore has no (negative) impact on the quality of the trained models; this can be confirmed from the results in Figure 5.
+
+
+
+
+*Figure 5: Convergence at different ZeRO stages*
+
+
+## Using DeepSpeed-Ulysses
+
+DeepSpeed-Ulysses can be easily integrated into existing code with just a few simple code changes. Here is an example of its use:
+
+```python
+from deepspeed.sequence.layer import DistributedAttention
+
+# Replace the original self-attention (attn) with DeepSpeed-Ulysses’s self-attention
+dist_attn = DistributedAttention(attn, get_sequence_parallel_group())
+```
+
+Compared with other libraries that support sequence parallelism, such as Megatron-LM, DeepSpeed-Ulysses does not require model refactoring.
+DeepSpeed-Ulysses is also integrated and tested with the Megatron-DeepSpeed code repository.
+If you are already using that repository to train large language models, you can immediately leverage DeepSpeed-Ulysses to train models with huge sequence lengths.
+
+## Try It Out Today!
+
+DeepSpeed-Ulysses is accessible through the DeepSpeed GitHub. A detailed tutorial on its usage is available on the [DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/ds-sequence/).
+
+We welcome broad cooperation and collaboration to explore, together with our users, what becomes possible by removing the constraints on handling long contexts. DeepSpeed-Ulysses is part of the bigger DeepSpeed ecosystem for large-scale AI training and inference. For details on DeepSpeed's many technologies and innovative features, please visit our [website](https://www.deepspeed.ai/) or follow us on X (formerly Twitter; [English](https://twitter.com/MSFTDeepSpeed), [Japanese](https://twitter.com/MSFTDeepSpeedJP)) and on China's [Zhihu](https://www.zhihu.com/people/deepspeed).
+
+DeepSpeed welcomes your participation in its development. You can report bugs, submit pull requests, and join discussions on the DeepSpeed GitHub page; see the [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for details. We also collaborate with universities, research labs, and companies. For such collaboration requests (and other topics not suited for GitHub), please email us directly at deepspeed-info@microsoft.com.
diff --git a/blogs/deepspeed-ulysses/media/convg.png b/blogs/deepspeed-ulysses/media/convg.png
new file mode 100644
index 000000000000..b9586dc404e4
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/convg.png differ
diff --git a/blogs/deepspeed-ulysses/media/convgZ.png b/blogs/deepspeed-ulysses/media/convgZ.png
new file mode 100644
index 000000000000..324f47cd61bd
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/convgZ.png differ
diff --git a/blogs/deepspeed-ulysses/media/dense1B1Mscale.png b/blogs/deepspeed-ulysses/media/dense1B1Mscale.png
new file mode 100644
index 000000000000..eb886f879247
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/dense1B1Mscale.png differ
diff --git a/blogs/deepspeed-ulysses/media/dense30B.png b/blogs/deepspeed-ulysses/media/dense30B.png
new file mode 100644
index 000000000000..d2eef04b73cc
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/dense30B.png differ
diff --git a/blogs/deepspeed-ulysses/media/dense7B.png b/blogs/deepspeed-ulysses/media/dense7B.png
new file mode 100644
index 000000000000..042269276a6b
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/dense7B.png differ
diff --git a/blogs/deepspeed-ulysses/media/fig2Ulysses.png b/blogs/deepspeed-ulysses/media/fig2Ulysses.png
new file mode 100644
index 000000000000..39e8a8420bde
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/fig2Ulysses.png differ
diff --git a/blogs/deepspeed-ulysses/media/fig3Ulysses.png b/blogs/deepspeed-ulysses/media/fig3Ulysses.png
new file mode 100644
index 000000000000..fa1498096284
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/fig3Ulysses.png differ
diff --git a/blogs/deepspeed-ulysses/media/fig4Ulysses.png b/blogs/deepspeed-ulysses/media/fig4Ulysses.png
new file mode 100644
index 000000000000..f55838b36e78
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/fig4Ulysses.png differ
diff --git a/blogs/deepspeed-ulysses/media/hero1.png b/blogs/deepspeed-ulysses/media/hero1.png
new file mode 100644
index 000000000000..f0034ffdf8b9
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/hero1.png differ
diff --git a/blogs/deepspeed-ulysses/media/hero2.png b/blogs/deepspeed-ulysses/media/hero2.png
new file mode 100644
index 000000000000..323d3d419002
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/hero2.png differ
diff --git a/blogs/deepspeed-ulysses/media/image3.png b/blogs/deepspeed-ulysses/media/image3.png
new file mode 100644
index 000000000000..18be3c843370
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/image3.png differ
diff --git a/blogs/deepspeed-ulysses/media/sparse30B.png b/blogs/deepspeed-ulysses/media/sparse30B.png
new file mode 100644
index 000000000000..2637d353d0c6
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/sparse30B.png differ
diff --git a/blogs/deepspeed-ulysses/media/sparse7B.png b/blogs/deepspeed-ulysses/media/sparse7B.png
new file mode 100644
index 000000000000..2d9c9ad69420
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/sparse7B.png differ
diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md
new file mode 100644
index 000000000000..e91ff1ecd51e
--- /dev/null
+++ b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md
@@ -0,0 +1,181 @@
+
+
+
+# DeepSpeed-VisualChat: Multi-Round Image + Text Inputs for a New Kind of AI Chat Experience
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed-VisualChat, please cite our [arxiv report](https://arxiv.org/abs/2309.14327):
+
+
+```
+@article{yao2023deepspeed-visualchat,
+ title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}},
+ author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14327},
+ year={2023}
+}
+```
+
+# 1. Overview
+Large language models (LLMs) such as GPT and LLaMa have demonstrated exceptional capabilities in a variety of text generation and understanding tasks, especially after zero-/few-shot learning or instructed fine-tuning. However, a key capability that must be added to prepare AI models for diverse tasks is multimodality; for example, an AI model should be able to read images, hear sounds, watch videos, and so on. This capability is largely absent from purely text-based LLMs.
+
+Recently, a large number of research projects have begun to explore introducing visual capabilities into LLMs, in particular enabling LLMs to understand images by feeding in image inputs (referred to as large vision-language models, or LVLMs for short).
+
+The main drawbacks of most existing work are:
+* The focus is mainly on tasks related to a single image, such as visual question answering and captioning, or on handling multiple images that are input simultaneously. Neither approach is good at managing interleaved image and text inputs.
+* System scalability is limited to models with about 10B parameters, which is an order of magnitude smaller than the largest open-source models.
+
+However, for a true AI chat model, the input may be multiple images interleaved with text, a situation rarely addressed by current work. In addition, the generative capability of LLMs grows rapidly as the model size increases. Therefore, focusing system capabilities on ~10B models limits further exploration of the potential of LVLMs.
+
+To address these issues, we introduce DeepSpeed-VisualChat (see the [arxiv report](https://arxiv.org/abs/2309.14327) for more details) with the following new features:
+
+* ***Fully open-source multi-round multi-image framework with unprecedented scalability***: DeepSpeed-VisualChat, one of the pioneering fully open-source frameworks, supports multi-round and multi-image dialogues, accommodating interleaved text and image inputs. We leverage DeepSpeed to improve our training, using a 2B visual encoder and a 70B LLaMA-2 decoder model, demonstrating the remarkable scalability of our framework.
+* ***Multi-Modal Causal Attention (MMCA)***: We design a new MMCA attention mechanism for multimodal models that computes attention weights independently across the different modalities. MMCA achieves a goal similar to the conventional cross-attention mechanism, but offers an enhanced causal attention interpretation for generative tasks, eliminating the need for extra modules or parameters. It also provides better training data efficiency than standard causal attention.
+* ***Data blending for interleaved inputs***: To facilitate conversations with interleaved modalities, DeepSpeed-VisualChat employs various data blending techniques on existing datasets, overcoming the shortage of interleaved text and image inputs in most existing open-source datasets.
+
+# 2. Model Architecture Overview
+
+
+
+ *Figure 1: Model architecture illustration of DeepSpeed-VisualChat.*
+
+
+As shown in *Figure 1*, the model architecture of DeepSpeed-VisualChat consists of three components: a visual encoder such as CLIP; a language decoder such as LLaMa-7B; and a feature alignment linear projection layer. Most of the model is frozen; only the language model's embedding and the linear projection layer are trainable. The total number of trainable parameters therefore ranges roughly from O(10M) (LLaMa-2-13B) to O(100M) (LLaMa-2-70B).
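+
+For intuition, here is a minimal sketch of the trainable feature-alignment projection; the class name and dimensions are illustrative assumptions, not the released configuration:
+
+```python
+import torch.nn as nn
+
+class FeatureAlignment(nn.Module):
+    # Hypothetical sketch: a single trainable linear layer that maps frozen
+    # vision-encoder features into the language decoder's hidden space.
+    def __init__(self, vision_dim: int = 1664, lm_hidden: int = 4096):
+        super().__init__()
+        self.proj = nn.Linear(vision_dim, lm_hidden)
+
+    def forward(self, vision_feats):  # [batch, num_image_tokens, vision_dim]
+        return self.proj(vision_feats)  # -> [batch, num_image_tokens, lm_hidden]
+```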
+
+# 3. DeepSpeed Multi-Modal Causal Attention
+
+Two common attention mechanisms used to connect the visual and textual components of a multimodal model are causal attention, as used in MiniGPT and QWen-VL, and cross-attention, as used in Otter and Flamingo.
+
+
+
+
+ *Figure 2: Different attention mechanisms: we compare different attention mechanisms using an input sentence "User: Please describe the image." and three image tokens (I-token1, I-token2, I-token3). On the left, we show standard causal attention, which treats image tokens as text. In the middle, we show cross-attention applied to the images while keeping standard causal attention for the text tokens. On the right, we show our novel MMCA attention mechanism, where image tokens only perform self-attention and text tokens attend to text/image tokens independently, with the image part in orange. This mechanism is defined by: softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$), where Q and K are the query and key, $`M_1`$=[M==1] and $`M_2`$=[M==2], with M $`\in`$ R^{10x10}.*
+
+
+Causal Attention (CA): CA-based methods simply project the visual features (i.e., the features output by the final visual encoder layer) into textual features and combine them with the normal text features after the text embedding layer as input to the LLM. The benefit of CA is that it is a natural extension of the LLM's original attention mechanism and therefore introduces no extra modules or parameters. However, intuition suggests this approach has some problems:
+
+* Each visual token attends to the visual and text tokens before it. However, the visual tokens are already fully encoded in a bidirectional manner and do not need to further attend to the visual and text tokens preceding them.
+* For a text token, the model needs to learn how to distribute its attention weights between the text and image tokens preceding it. Because of these issues, we found that the data efficiency of CA in LVLMs is often problematic. To address this, LLaVA and QWen-VL require vision-language pretraining to fully align the visual features with the textual features.
+
+Cross-Attention (CrA): As an alternative, cross-attention (CrA) combined with CA shows better data efficiency, but it also comes with drawbacks:
+
+* It introduces new parameters into the model. For example, Otter, with the new parameters introduced by cross-attention, has more than 1.5 billion trainable parameters. Compared with LLaVA's millions of trainable parameters, this greatly increases training cost and memory requirements.
+* Careful design is required if an image is introduced in the middle of training, since preceding text tokens should not be able to attend to the image.
+
+Multi-Modal Causal Attention (MMCA): To overcome these issues, we propose a new multi-modal causal attention mechanism (MMCA) that has both the parameter efficiency of CA and the data efficiency of CrA. The overall idea is as follows:
+
+* Visual tokens attend only to themselves, since they are encoded by the visual encoder.
+* Text tokens attend to all previous tokens; however, MMCA uses two separate attention weight matrices for the preceding text tokens and image tokens.
+
+The intuition behind the second point of MMCA is that the attention weights of one modality may affect those of the other. For example, a text token may attend more to textual information than to visual information. Hence, if the attention weight matrix were normalized across both modalities, the attention scores of the visual tokens could become very small. Please refer to *Figure 2* for a visualization of the three attention mechanisms.
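+
+To make the masking concrete, here is a minimal PyTorch sketch of the Figure 2 formula, softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$). Following the usual convention, masked entries are set to -inf before each softmax rather than multiplied by zero; the shapes and function name are illustrative:
+
+```python
+import torch
+
+def mmca(Q, K, V, M):
+    # Q, K, V: [T, d]; M: [T, T] integer mask with entries in {0, 1, 2}.
+    att = Q @ K.transpose(-1, -2) / K.shape[-1] ** 0.5
+
+    def masked_softmax(scores, mask):
+        scores = scores.masked_fill(~mask, float("-inf"))
+        # Rows with no valid keys become all -inf; replace their NaNs with 0.
+        return scores.softmax(dim=-1).nan_to_num(0.0)
+
+    weights = masked_softmax(att, M == 1) + masked_softmax(att, M == 2)
+    return weights @ V
+```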
+
+Demo results. We first show, through several examples, DeepSpeed-VisualChat's single-image vision-language conversation capabilities under different attention mechanisms. In these experiments, we use the LLaMA2-7B language model and the QWen-VL visual encoder as our visual encoder. The two models are connected by a simple linear projection layer. The model was trained on two LLaVa datasets. As shown in *Figure 3* and *Figure 4*, when paired with MMCA, DeepSpeed-VisualChat effectively identifies the visual details in the images and gives accurate and fluent answers to the user's questions.
+In addition, compared with other attention mechanisms, such as the combination of causal attention with cross-attention, MMCA exhibits a more comprehensive and precise grasp of image details. Compared with the combination of CrA and CA, as well as MMCA, using CA alone may show slightly more errors (*Figure 3*) or lead to weaker comprehension (*Figure 4*).
+
+
+
+
+ *Figure 3: Example visual and language inputs, showing the output comparison between (1) standard causal attention (CA), (2) standard causal attention combined with cross-attention (CA + CrA), and (3) the special multi-modal causal attention (MMCA) in DeepSpeed-VisualChat.*
+
+
+
+
+
+ *Figure 4: DeepSpeed-VisualChat accurately identifies the scene as a beautiful lake and provides a reasonable set of suggestions. In contrast, the other attention mechanisms misinterpret the image as containing "a dock with a boat ramp".*
+
+
+# 4. Data Blending
+We used 9 datasets from 3 sources, as described in our [arxiv report](https://arxiv.org/abs/2309.14327). A critical missing element for enabling multi-round, multi-image conversations is adequate data. The only multi-round multi-image data source we found is the SparklesDialogue dataset, which contains a mere 6,520 samples. To address this, we employed two methods to synthesize multi-round multi-image data from existing single-image or single-round data: simple data concatenation and LLaVA-Otter blending.
+
+## 4.1 Simple Data Concatenation
+For the "llava" and "llava_dial" datasets used by the LLaVA model, each sample consists of single-/multi-round conversations about a single image. To simulate a scenario where a user asks about multiple images in sequence, we applied simple post-processing to these two datasets. Specifically, we randomly concatenated a varying number of samples into one sample: for "llava" we concatenated 1 to 3 samples, and for "llava_dial" we concatenated 1 to 2 samples, as sketched below.
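+
+As a rough illustration of this post-processing (the field names and sample structure below are hypothetical, not the datasets' actual schema):
+
+```python
+import random
+
+def concat_samples(dataset, max_k=3):
+    # Randomly merge 1..max_k single-image samples into one
+    # multi-image, multi-round sample.
+    dataset = dataset[:]
+    random.shuffle(dataset)
+    merged, i = [], 0
+    while i < len(dataset):
+        k = random.randint(1, max_k)
+        group = dataset[i:i + k]
+        merged.append({
+            "images": [s["image"] for s in group],
+            "conversations": [t for s in group for t in s["conversations"]],
+        })
+        i += k
+    return merged
+```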
+
+## 4.2 LLaVA-Otter Blending
+We noticed that the llava and llava_dial datasets used by the LLaVA model and the otter_mimicit_cgd dataset used by the Otter model all use images from COCO train2017. For llava and llava_dial, each sample consists of single-/multi-round conversations about one image. For otter_mimicit_cgd, each sample consists of a single-round conversation about a pair of images. This allows us to construct a synthetic multi-round multi-image dataset, llava_otter_blend, as a more natural blend: for each sample in otter_mimicit_cgd, we look for llava and llava_dial samples that use the same images, and build a new sample in the order "llava/llava_dial conversations, then the otter_mimicit_cgd conversation".
+
+
+
+
+ *Figure 5: A data sample after LLaVA-Otter blending. The gray dialog boxes are from the LLaVA datasets, and the orange dialog boxes are from the Otter dataset.*
+
+
+# 5. Demo
+We trained our DeepSpeed-VisualChat-13B model, which uses a 2B visual encoder and a 13B LLaMA model, on several open-source datasets. DeepSpeed-VisualChat-13B demonstrates image captioning (*Figures 6--8*), counting and text reading (*Figure 6*), celebrity recognition (*Figure 7*), storytelling (*Figure 8*), and more.
+
+
+
+
+ *Figure 6: DeepSpeed-VisualChat can count the number of people in an image and read the text in the first image. It also shows cross-image understanding.*
+
+
+
+
+
+ *Figure 7: DeepSpeed-VisualChat can recognize celebrities and connect them with their achievements.*
+
+
+
+
+
+ *Figure 8: DeepSpeed-VisualChat can tell stories and recognize movies.*
+
+
+# 6. How to Get Started with DeepSpeed-VisualChat
+DeepSpeed-VisualChat is an easy-to-use training framework with great scalability, tested so far on LLaMa-2-70B models. We adopt a unified instruction-tuning format for all experiments, with the template shown below.
+```
+ % You are a powerful vision-language assistant.
+
+### Image 1: % some image, e.g., cat-1.png
+### Question: % please describe the image.
+### Answer: % It's a cute black cat.
+
+### Image 2: % some image, e.g., cat-2.png
+### Image 3: % some image, e.g., cat-3.png
+### Question: % What's the difference between the three cats?
+### Answer: % The colors of the three cats are different.
+...
+```
+
+Training a model with DeepSpeed-VisualChat is simple and convenient. Here we give an example based on the CLIP visual encoder and the LLaMa-7B model:
+
+```
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/
+pip install -r requirements.txt
+cd training
+bash training_scripts/run_7b.sh
+```
+
+The trained model checkpoints will be automatically saved in a Hugging Face-compatible format and can be used to launch your own visual chat API:
+```
+cd ../chat
+bash chat_scripts/run.sh # You need to change the necessary variables, e.g., the ckpt path
+```
+
+To support larger model inference, we have integrated Hugging Face large model inference into our DeepSpeed-VisualChat API. As a result, users can choose a different number of GPUs based on GPU memory capacity and model size.
+
+Please refer to our [GitHub landing page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details.
+
+# 7. 发布:今天尝试 DeepSpeed-VisualChat!
+
+我们非常兴奋地分享 DeepSpeed-VisualChat 现已开源并供 AI 社区使用。
+
+* 要开始使用,请访问我们的 DeepSpeed-VisualChat GitHub 页面:[GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)
+
+* 我们将继续在您的反馈和支持下改进 DeepSpeed-VisualChat。我们的 [路线图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) 显示了目前支持的功能以及未来计划支持的功能。
+
+DeepSpeed-VisualChat 是更大的 DeepSpeed 生态系统的一部分,其中包括一系列深度学习系统和建模技术。要了解更多信息,
+
+* 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。
+* 在我们的 [英文 X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[日语 X(Twitter)](https://twitter.com/MSFTDeepSpeedJP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。
+
+我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。
+
+* 如果你喜欢我们的工作,请在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 和 [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) 上为我们的仓库点“星”。
diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md
new file mode 100755
index 000000000000..ec23e005bff6
--- /dev/null
+++ b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md
@@ -0,0 +1,188 @@
+
+
+# DeepSpeed-VisualChat: 複数ラウンド・複数画像の入力が可能なAIチャット体験を実現
+
+
+
+
+
+
+
+
+DeepSpeed-VisualChatを引用する場合、[arxiv上のレポート](https://arxiv.org/abs/2309.14327)を引用してください。
+
+```
+@article{yao2023deepspeed-visualchat,
+ title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}},
+ author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14327},
+ year={2023}
+}
+```
+
+# 1. 概要
+GPTやLLaMaのような大規模言語モデル(LLM)は、テキスト生成やテキスト理解などの多くのタスクにおいて、Zero-shot/Few-shot学習、あるいはinstructed fine-tuningによって、非常に優れた能力を示してきました。しかし、AIエージェントをより多様なタスクに対応させるには、マルチモーダルを扱う能力が必要です。例えば、AIエージェントは画像を読んだり、音声を聞いたり、ビデオを見たりすることができる必要があります。こうした機能は、テキストベースのLLMにはほとんどありません。
+
+近年、LLMに視覚的な能力を導入することは、研究・実践の両方において広く試みられています。特に、画像をそのまま与えて、LLMが理解できるようにする取り組みが行われています(大規模視覚言語モデル、略してLVLMなどと呼ばれる)。
+
+こうした分野における、既存の研究の主な問題は以下の通りです:
+
+* 視覚に関する質問への回答やキャプション付けのように、単一の画像に関連するタスクや、同時に入力される複数の画像の処理に重点が置かれており、画像とテキストが交互に入力されるような状況には対応していない
+* システムのスケーラビリティは、~10Bのパラメータを持つモデルに限定される
+
+しかし、本来はAIチャットエージェントには、複数のテキストと画像の両方が与えられる可能性があります。また、LLMの生成能力は、モデルサイズが大きくなるにつれて急速に向上することが知られており、~10Bのモデルではその能力が制限されてしまいます。
+
+これらの問題を解決するために、我々は以下の新たな機能を備えたDeepSpeed-VisualChat(詳細は[arxivのレポート](https://arxiv.org/abs/2309.14327)を参照)を開発しました:
+
+* ***完全にオープンソース化され、前例のないスケーラビリティを備えた複数ラウンド・複数画像を処理できるフレームワーク***: DeepSpeed-VisualChatは、完全にオープンソース化された先進的なフレームワークの1つであり、複数ラウンドを通じて画像とテキストが両方与えられる対話を可能にします。また、DeepSpeedを利用することで、比類ないスケーラビリティを実現しており、実際に2Bのビジュアルエンコーダーと70BのLLaMA-2デコーダーモデルで訓練を行えます。
+* ***マルチモーダル因果的注意(MMCA)***: マルチモーダルモデルのための新しいアテンションMMCA(Multi-Modal Causal Attention)を考案し、異なるモダリティ間で独立にアテンションの重みを計算します。MMCAは、従来のcross attentionに類似したものですが、生成タスクのためのcausal attentionを強化しており、追加のモジュールやパラメータが不要になります。また、標準的なcausal attentionと比較して、優れた訓練データ効率を示します。
+* ***順次与えられる画像とテキストを扱うためのデータブレンディング***: DeepSpeed-VisualChatは、既存のデータセットに様々なデータブレンディング技術を採用しています。これにより、順次与えられるテキストと画像の不足という、利用可能なオープンソースデータセットのほとんどに当てはまる課題を克服しています。
+
+# 2 モデルアーキテクチャの概要
+
+
+
+ *図1: モデルアーキテクチャの概要*
+
+
+
+*図1*に示すように、DeepSpeed-VisualChatのモデルアーキテクチャは、CLIPのような視覚エンコーダー、LLaMa-7Bのような言語デコーダー、特徴アライメントを行う linear projectionレイヤの3つのコンポーネントで構成されています。モデルのほとんどのパラメータは固定されており、言語モデルのembeddingとlinear projectionレイヤのみが学習可能です。その結果、学習可能なパラメータの総数は O(10M) (LLaMa-2-13B) から O(100M) (LLaMa-2-70B) となります。
+
+# 3. DeepSpeed マルチモーダル Causal Attention (MMCA)
+
+マルチモーダルモデルで、画像とテキストをつなぐ一般的なattentionの機構は二つあります。一つはMiniGPTやQWen-VLで使われているようなcausal attentionで、もう一つはOtterやFlamingoで使われているようなcross attentionです。
+
+
+
+
+
+ *図2: 異なるアテンションの機構: 「ユーザー:画像を説明してください」という入力文と3つの画像トークン(I-token1、I-token2、I-token3)を組み合わせて与えた場合の、それぞれのattention機構の構成を示しています。左側では、標準的なcausal attentionによって、画像トークンをテキストとして扱う様子を示しています。中央は、テキストトークンに対する標準的なcausal attentionを維持しながら、画像に適用されるcross attentionを使用する様子を示しています。右側では、画像トークンはself attentionのみを行い、テキストトークンはテキスト/画像トークンへのアテンションを独立に計算するという、新しいマルチモーダルのためのアテンションの提案を、オレンジ色のマスクで強調して示しています。この仕組みは、Q, Kをクエリとキーとしたとき、softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$) として定義されます。ここで $`M \in R^{10\times10}`$、$`M_1`$=[M==1]、$`M_2`$=[M==2] です。*
+
+
+Causal Attention(CA):CAに基づく方法は、視覚的特徴(最終的な視覚エンコーダ層の出力からの特徴)を単純にテキストの特徴量に投影し、テキスト埋め込み層以降の通常のテキストの特徴量と組み合わせてLLMに送り込むというものです。CAの利点は、LLMにおける本来のアテンション機構の自然な拡張であり、そのため余分なモジュールやパラメータを導入しないことです。しかし、このアプローチにはいくつかの直感的な問題があります:
+
+* 視覚トークンはすでに双方向に特徴量に変換されており、本来は他の視覚トークンやテキストトークンとのアテンションは必要ありませんが、実際には前の視覚トークンまたはテキストトークンとのアテンションを計算してしまいます。
+* テキストトークンの場合、モデルは前のテキストトークンと画像トークンとの間でどのようにアテンションの重みを配分するかを学習する必要があります。これらの問題により、LVLMにおけるCAのデータ効率にはしばしば問題があることが分かりました。この問題への対処として、LLaVAとQWen-VLは、視覚的特徴とテキストの特徴を完全に対応させるために、視覚言語の事前学習を必要とします。
+
+Cross Attention (CrA):代替案であるCross Attention (CrA) と CAの組み合わせは、より優れたデータ効率を示しますが、いくつかの欠点もあります:
+
+* モデルに新しいパラメーターを導入する必要があります。例えば、Otterは、Cross Attentionによって導入された新しいパラメータがあるため、LLaVAが数百万個の学習可能なパラメータを持つのに対し、15億個以上のパラメータを必要とします。これにより、学習コストと必要メモリ量が大幅に増加します。
+* 訓練中に会話の途中で画像が与えられた場合、前のテキストトークンは与えられた画像とのアテンションを求められないので、慎重な設計が必要です。
+
+マルチモーダル Causal Attention (MMCA):これらの問題を解決するために、我々は新しいマルチモーダルCausal Attention (MMCA) を提案します。この機構は、CAと同様のパラメータ効率と、CrAと同様のデータ効率の、両方の利点を持つものです。全体的なアイデアは以下の通りです:
+
+* 視覚トークンは視覚エンコーダによってエンコードされるため、視覚トークンは自分自身とのアテンションのみを利用する。
+* テキストトークンについては、その前のすべてのトークンに注目する。ただし、前のテキストトークンと画像トークンに対して、それぞれ別々のアテンションの重み行列を持つ。
+
+MMCAの2つ目のポイントは、1つのモダリティに対するアテンションの重みが、もう1つのモダリティに影響を与える可能性があるということです。例えば、テキストトークンは、視覚情報よりもテキスト情報により大きなアテンションを持つかもしれません。そのため、アテンションの重み行列を両方のモダリティで正規化すると、視覚トークンのアテンションスコアが非常に小さくなる可能性があります。3つのアテンション機構の視覚化については、*図2*を参照してください。
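+
+参考として、上記の2つの独立したアテンション重み行列の構成を示す、PyTorchによる簡単なスケッチを以下に示します。*図2* の softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$) に対応しますが、`is_image` マスクや関数名は説明用の仮のものであり、DeepSpeed-VisualChatの公式実装ではありません:
+
+```
+import torch
+import torch.nn.functional as F
+
+def mmca_weights(scores, is_image):
+    # scores: (L, L) のアテンションlogits(Q @ K^T / sqrt(d));is_image: (L,) のブール値
+    L = scores.size(0)
+    i, j = torch.meshgrid(torch.arange(L), torch.arange(L), indexing="ij")
+    causal = j <= i
+    m1 = causal & (is_image[i] == is_image[j])  # 画像→画像、テキスト→前のテキスト
+    m2 = causal & ~is_image[i] & is_image[j]    # テキスト→前の画像
+    neg = torch.finfo(scores.dtype).min
+    w1 = F.softmax(scores.masked_fill(~m1, neg), dim=-1)
+    w2 = F.softmax(scores.masked_fill(~m2, neg), dim=-1)
+    # 行全体がマスクされたsoftmaxは一様分布になるため、明示的にゼロにする
+    w1 = w1 * m1.any(-1, keepdim=True)
+    w2 = w2 * m2.any(-1, keepdim=True)
+    return w1 + w2  # それぞれ独立に正規化された2つの重みの和
+```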
+
+出力例:まず、異なるアテンションの機構を採用した、画像を一つだけ用いた会話におけるDeepSpeed-VisualChatの能力を示す様々な例を紹介します。これらの実験では、言語モデルとしてLLaMA2-7Bを、視覚エンコーダとしてQWen-VLの視覚エンコーダを使用します。これら2つのモデルはlinear projection layerを介して接続されています。このモデルは2つのLLaVaデータセットで学習を行いました。*図3*と*図4*で実証されているように、DeepSpeed-VisualChatはMMCAと組み合わされることで、画像内の視覚的な詳細を効果的に識別し、ユーザーのクエリに対して首尾一貫した応答を提供します。さらに、MMCAは、Causal AttentionとCross Attentionの両方から合成されたマスクを使用するような、別のアテンション機構と比べて、より包括的で正確な画像詳細の把握が可能です。また、CrAとCAの組み合わせやMMCAとは対照的に、CA単独では若干エラーが多く(*図3*)、推論能力の程度が低い(*図4*)可能性があることも明らかです。
+
+
+
+
+ *図3: (1) 標準的なcausal attention (CA) (2) cross attentionと組み合わせた標準的なcausal attention (CA+CrA) (3)DeepSpeed-VisualChatの特別なマルチモーダルCausal Attention (MMCA) の出力比較を示す視覚入力と言語入力の例。*
+
+
+
+
+
+ *図4:DeepSpeed-VisualChatは、示された場面を美しい湖として正確に識別し、妥当な提案のセットを提示する。対照的に、ベースラインは画像を「ボート乗り場のあるドック」と誤認識している。*
+
+
+# 4. データブレンディング
+
+[arxivのレポート](https://arxiv.org/abs/2309.14327)に記載されているように、訓練には3つのソースから9つのデータセットを使用しました。複数ラウンド・複数画像の入力を可能にするために決定的に欠けている要素は、適切なデータがないことです。我々が見つけた複数ラウンド・複数画像の唯一のデータソースはSparklesDialogueデータセットで、そこにはわずか6520サンプルしか含まれていません。この制限に対処するため、既存の単一画像または単一ラウンドのデータから、複数ラウンド・複数画像のデータを合成するために、単純なデータ連結とLLaVA-Otterデータ混合という2つの方法を採用しました。
+
+## 4.1 単純なデータ連結
+LLaVAモデルで利用する "llava" と "llava_dial" データセットでは、各サンプルは1つの画像に対する単一/複数ラウンドの会話で構成されています。ユーザーが複数の画像について逐次質問するシナリオをシミュレートするため、これら2つのデータセットに対して、簡単なデータ後処理を行いました。具体的には、ランダムな数のサンプルを1つのサンプルとして連結しました。 "llava" の場合は1~3個のサンプルを連結し、"llava_dial" の場合は1~2個のサンプルを連結しました。
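+
+この後処理の考え方を示す簡単なスケッチを以下に示します(`images` や `conversations` といったフィールド名は説明用の仮定であり、実際のデータセットのフィールド名ではありません):
+
+```
+import random
+
+def concat_samples(dataset, max_k):
+    # "llava" では max_k=3、"llava_dial" では max_k=2 を使用
+    pool = list(dataset)
+    random.shuffle(pool)
+    merged = []
+    while pool:
+        k = random.randint(1, max_k)           # 1~max_k 個のサンプルをランダムに連結
+        group, pool = pool[:k], pool[k:]
+        merged.append({
+            "images": [img for s in group for img in s["images"]],
+            "conversations": [t for s in group for t in s["conversations"]],
+        })
+    return merged
+```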
+
+## 4.2 LLaVAとOtterのデータブレンディング
+
+LLaVAモデルで使用されているllavaとllava_dialデータセット、およびOtterモデルで使用されているotter_mimicit_cgdデータセットは、すべてCOCO train2017画像を使用しています。llavaデータセットとllava_dialデータセットには、各サンプルに1つの画像に対する単発/複数回の会話が含まれます。otter_mimicit_cgdデータセットでは、各サンプルは画像のペアに対する1ラウンドの会話を含んでいます。そこで、otter_mimicit_cgdデータセットの各サンプルについて、同じ画像を使うllavaとllava_dialのサンプルを探し、「llava/llava_dial会話 -> otter_mimicit_cgd会話」という流れで新しいサンプルを構築しました。
+
+
+
+
+ *図5: LLaVA-Otterデータブレンド後のデータサンプル。灰色のダイアログボックスはLLaVAデータセットから、オレンジ色のダイアログボックスはOtterデータセットからのもの*
+
+
+# 5. デモ
+いくつかのオープンソースデータセットで2Bビジュアルエンコーダーと13B LLaMAモデルを使い、DeepSpeed-VisualChat-13Bモデルを訓練しました。DeepSpeed-VisualChat-13Bは、画像キャプション機能(*図6-8*)、計数とテキスト読み取り(*図6*)、著名人の認識(*図7*)、ストーリーテリング(*図8*)などを示しています。
+
+
+
+
+ *図6: DeepSpeed-VisualChatは、画像内の人数を数え、最初の画像のテキストを読み取ることができます。また、複数画像を横断的に理解することも可能です。*
+
+
+
+
+
+
+ *図7: DeepSpeed-VisualChatは有名人を認識し、その人物の業績と関連付けることができます*
+
+
+
+
+
+
+ *図8: DeepSpeed-VisualChatは、ストーリーを作ったり、映画を認識したりできます。*
+
+
+
+# 6. DeepSpeed-VisualChatを使い始めるには
+DeepSpeed-VisualChatは使いやすく、かつ優れたスケーラビリティを持つ学習フレームワークで、これまでLLaMa-2-70Bモデルでテストされています。
+すべての実験で統一された命令チューニング形式を採用しており、そのテンプレートを以下に示します。
+
+```
+ % You are a powerful vision-language assistant.
+
+### Image 1: % some image, e.g., cat-1.png
+### Question: % please describe the image.
+### Answer: % It's a cute black cat.
+
+### Image 2: % some image, e.g., cat-2.png
+### Image 3: % some image, e.g., cat-3.png
+### Question: % What's the difference between the three cats?
+### Answer: % The colors of the three cats are different.
+...
+```
+
+DeepSpeed-VisualChatの訓練は簡単かつ便利に実行できます。ここではCLIPビジュアルエンコーダーとLLaMa-7Bモデルを使用する例を示します:
+
+```
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/
+pip install -r requirements.txt
+cd training
+bash training_scripts/run_7b.sh
+```
+
+訓練されたチェックポイントは自動的にHugging Faceと互換性のある形式で保存され、独自のビジュアルチャットAPIを提供するために使用できます:
+
+```
+cd ../chat
+bash chat_scripts/run.sh # You need to change necessary variables, e.g., ckpt path
+```
+
+より大規模なモデル推論をサポートするために、我々はHugging Faceの大規模モデル推論をDeepSpeed-VisualChat APIに組み込みました。そのため、ユーザーはGPUメモリ容量とモデルサイズに基づいて、異なるGPU数を選択することができます。
+
+詳細は[ランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)をご参照ください。
+
+# 7. 早速使ってみましょう!
+
+DeepSpeed-VisualChatがオープンソース化され、AIコミュニティで利用できるようになったことを大変嬉しく思います。
+
+* まずは、DeepSpeed-VisualChatのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)
+
+* DeepSpeed-VisualChatは、皆様からのフィードバックとサポートにより改良を続けていきます。私たちの[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-)は、現在サポートされている機能と将来的に計画している機能を示しています。
+
+DeepSpeed-VisualChatは、さまざまなDeep Learningシステムやモデリング技術を含む、より大きなDeepSpeedエコシステムの一部です。詳細については、以下をご覧ください。
+
+* 私たちの[ウェブサイト](https://www.deepspeed.ai/)で、詳細なブログ記事、チュートリアル、役立つドキュメントを提供しています。
+* DeepSpeedの最新ニュースは、[English X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP)、[Chinese Zhihu](https://www.zhihu.com/people/deepspeed)をフォローしてください。
+
+DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については、deepspeed-info@microsoft.com まで直接メールをお送りください。
+
+* 私たちの[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)および[DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/)リポジトリが気に入ったら、ぜひスターをつけてください!
diff --git a/blogs/deepspeed-visualchat/10-03-2023/README.md b/blogs/deepspeed-visualchat/10-03-2023/README.md
new file mode 100755
index 000000000000..eac9d06c3cea
--- /dev/null
+++ b/blogs/deepspeed-visualchat/10-03-2023/README.md
@@ -0,0 +1,188 @@
+
+
+# DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed-VisualChat, please cite our [arxiv report](https://arxiv.org/abs/2309.14327):
+
+```
+@article{yao2023deepspeed-visualchat,
+ title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}},
+ author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14327},
+ year={2023}
+}
+```
+# 1. Overview
+Large language models (LLMs), such as GPT and LLaMa, have showcased exceptional prowess in a myriad of text generation and comprehension tasks, especially when subjected to zero-/few-shot learning or after instruction fine-tuning. However, to equip AI agents for diverse tasks, one critical feature that needs to be incorporated is multi-modal capability; for instance, the AI agent should be able to read images, hear voices, watch videos, etc. This capability is largely absent in solely text-based LLMs.
+
+Recently, one of the research/practice mainstreams has begun exploring the incorporation of visual capability into LLMs, especially enabling LLMs to understand images by inserting raw pictures (referred to as large visual language models, or LVLMs in short).
+
+The main caveats of the majority of existing works are:
+* The focus is predominantly on tasks related to a single image, such as visual question answering and captioning, or on handling multiple images that require concurrent input. Neither approach adeptly manages interleaved image-and-text input.
+* The scalability of the system is limited to models with ~10B parameters, which is about an order of magnitude smaller than the largest open-sourced models.
+
+However, for a genuine AI chat agent, the content of inputs could be multiple images interleaved with text, a situation rarely addressed by current works. Also, the generation capability of LLMs grows quickly as the model size increases. Therefore, focusing system capability on ~10B models limits further exploration of the potential of LVLMs.
+
+To resolve these issues, we are introducing DeepSpeed-VisualChat (see [arxiv report](https://arxiv.org/abs/2309.14327) for more details) with the following new features:
+
+* ***Fully Open-Sourced Multi-round Multi-image Framework with Unprecedented Scalability***: DeepSpeed-VisualChat, one of the pioneering fully open-sourced frameworks, enables multi-round and multi-image dialogues, accommodating interleaved text-and-image inputs. We leverage DeepSpeed to enhance our training with a 2B visual encoder and a 70B LLaMA-2 decoder model, illustrating the remarkable scalability of our framework.
+* ***Multi-Modal Causal Attention (MMCA)***: We devise a novel MMCA for multi-modal models that computes attention weights independently across various modalities. MMCA achieves objectives analogous to conventional cross-attention mechanisms but offers enhanced causal attention interpretations for generative tasks, eliminating the need for additional modules or parameters. It also presents superior training data efficiency compared to standard causal attention.
+* ***Data Blending for Interleaved Inputs***: To facilitate conversations with interleaved modalities, DeepSpeed-VisualChat employs assorted data blending techniques on existing datasets, overcoming the shortage of interleaved text-and-image inputs in most available open-source datasets.
+
+
+
+# 2 Model architecture overview
+
+
+
+ *Figure 1: Model architecture illustration.*
+
+
+
+The model architecture of DeepSpeed-VisualChat, as depicted in *Figure 1*, is composed of three components: a visual encoder, such as CLIP; a language decoder, such as LLaMa-7B; and a feature alignment linear projection layer. Most parts of the model are frozen, with only the embedding of the language model and the linear projection layer being trainable. Consequently, the total number of trainable parameters ranges from approximately O(10M) (LLaMa-2-13B) to O(100M) (LLaMa-2-70B).
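+
+As a rough sketch of this trainable-parameter layout (module and attribute names here are illustrative assumptions, not the actual DeepSpeed-VisualChat classes, and a Hugging Face-style `get_input_embeddings()` accessor is assumed for the language model):
+
+```
+import torch.nn as nn
+
+class VisualChatSketch(nn.Module):
+    def __init__(self, visual_encoder, language_model, vis_dim, txt_dim):
+        super().__init__()
+        self.visual_encoder = visual_encoder    # e.g., a CLIP-style encoder
+        self.language_model = language_model    # e.g., a LLaMa decoder
+        # The only newly introduced module: aligns visual features to text space.
+        self.projection = nn.Linear(vis_dim, txt_dim)
+        for p in self.parameters():             # freeze everything ...
+            p.requires_grad = False
+        for p in self.projection.parameters():  # ... except the projection layer
+            p.requires_grad = True
+        for p in self.language_model.get_input_embeddings().parameters():
+            p.requires_grad = True              # ... and the LLM token embeddings
+```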
+
+# 3. DeepSpeed multi-modal causal attention
+
+There are two common attention mechanisms used to connect the visual and textual components in a multi-modal model: causal attention, as used in MiniGPT and QWen-VL, and cross attention, as used in Otter and Flamingo.
+
+
+
+
+ *Figure 2: Different Attention Mechanisms: Examine the differing attention mechanisms using an input sentence "User: Please describe the image." coupled with three Image tokens (I-token1, I-token2, I-token3). On the left, we demonstrate standard causal attention, treating image tokens as text. In the middle, we present cross attention applied to images, while maintaining standard causal attention for text tokens. On the right, we illustrate our innovative multi-modal attention proposal where image tokens only perform self-attention, and text tokens attend to text/image tokens independently, highlighted with an orange mask. This mechanism is defined by: softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$), with Q and K as query and key, $`M_1`$=[M==1], $`M_2`$=[M==2], and $`M \in R^{10\times10}`$ in this case.*
+
+
+
+Causal Attention (CA): The CA-based method simply projects visual features (i.e., the features from the output of the final visual encoder layer) into textual features and combines them with the normal textual features after the textual embedding layer to feed into LLMs. The benefit of CA is that it's a natural extension of the original attention mechanism in LLMs, and as such, it doesn't introduce any extra modules or parameters. However, this approach raises some intuitive problems:
+
+* For a visual token, it attends to previous visual and textual tokens, even though visual tokens are already fully encoded in a bidirectional manner and do not need further attention to other visual tokens or previous textual tokens.
+* For a textual token, the model needs to learn how to distribute its attention weights between its previous textual and image tokens. Due to these issues, we found that the data efficiency of CA in LVLMs is often problematic. To address this, LLaVA and QWen-VL require visual-language pretraining to fully align visual features with textual features.
+
+Cross Attention (CrA): The alternative, cross attention (CrA), along with CA, exhibits better data efficiency but also comes with a few drawbacks:
+
+* It introduces new parameters to the model. For example, due to the new parameters introduced by cross attention, Otter has more than 1.5 billion trainable parameters, compared to the millions of trainable parameters in LLaVA. This significantly increases the training cost and memory requirements.
+* It requires careful design if an image is introduced in the middle of a conversation during training, as previous text tokens should not be able to attend to the image.
+
+Multi-Modal Causal Attention Mechanism (MMCA): To overcome these issues, we propose a new multi-modal causal attention mechanism (MMCA), which has both benefits, i.e., similar parameter efficiency as CA and similar data efficiency as CrA. The overall idea is as follows:
+
+* For visual tokens, they only attend to themselves, as visual tokens are encoded by the visual encoder.
+* For textual tokens, they attend to all their previous tokens. However, they have two separate attention weight matrices for their previous textual tokens and image tokens.
+
+The intuition behind the second point of MMCA is that the attention weight for one modality may affect the other modality. For instance, a textual token may pay more attention to textual information than visual information. Therefore, if the attention weight matrix is normalized across both modalities, the attention score for visual tokens might be very small. Refer to *Figure 2* for a visualization of the three attention mechanisms.
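+
+For illustration, here is a minimal PyTorch sketch of the two separate attention weight matrices, matching the softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$) formulation in *Figure 2*. The `is_image` mask and the function name are assumptions for exposition, not the actual implementation:
+
+```
+import torch
+import torch.nn.functional as F
+
+def mmca_weights(scores, is_image):
+    # scores: (L, L) attention logits (Q @ K^T / sqrt(d)); is_image: (L,) bool mask
+    L = scores.size(0)
+    i, j = torch.meshgrid(torch.arange(L), torch.arange(L), indexing="ij")
+    causal = j <= i
+    m1 = causal & (is_image[i] == is_image[j])  # image->image; text->previous text
+    m2 = causal & ~is_image[i] & is_image[j]    # text->previous image
+    neg = torch.finfo(scores.dtype).min
+    w1 = F.softmax(scores.masked_fill(~m1, neg), dim=-1)
+    w2 = F.softmax(scores.masked_fill(~m2, neg), dim=-1)
+    # A fully masked row degenerates to a uniform softmax; zero such rows out.
+    w1 = w1 * m1.any(-1, keepdim=True)
+    w2 = w2 * m2.any(-1, keepdim=True)
+    return w1 + w2  # sum of two independently normalized weight matrices
+```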
+
+
+Demo Results. We begin by showcasing various examples that highlight the capabilities of DeepSpeed-VisualChat in single-image visual language conversations, employing different attention mechanisms. In these experiments, we employ the LLaMA2-7B language model in conjunction with the QWen-VL visual encoder. These two models are connected via a straightforward linear projection layer. Our model underwent training on two LLaVa datasets. As demonstrated in *Figure 3* and *Figure 4*, DeepSpeed-VisualChat, when coupled with MMCA, effectively discerns visual details in images and furnishes coherent responses to user queries.
+Furthermore, DeepSpeed-VisualChat exhibits a more comprehensive and precise grasp of image details compared to alternative attention mechanisms, such as the use of combined masks from both causal attention and cross attention. It is also evident that, in contrast to the combination of CrA and CA, as well as MMCA, CA alone may exhibit slightly more errors (*Figure 3*) and a lower degree of reasoning capability (*Figure 4*).
+
+
+
+
+ *Figure 3: Example visual and language inputs that demonstrate the output comparison between (1) the standard causal attention (CA) (2) the standard causal attention combined with cross-attention (CA+ CrA) and (3) the special multi-modal causal attention (MMCA) in DeepSpeed-VisualChat.*
+
+
+
+
+
+
+ *Figure 4: DeepSpeed-VisualChat accurately identifies the scene as a beautiful lake and offers a set of plausible suggestions. In contrast, the baseline misinterprets the image as containing “dock with a boat ramp”.*
+
+
+
+# 4. Data blending
+We used 9 datasets from 3 sources as described in our [arxiv report](https://arxiv.org/abs/2309.14327). A critical obstacle to enabling multi-round and multi-image conversations is the absence of adequate data. The sole source of multi-round multi-image data we located is the SparklesDialogue dataset, which contains a mere 6520 samples. To address this limitation, we employed two methods to synthesize multi-round multi-image data from existing single-image or single-round data: simple data concatenation and LLaVA-Otter data blending.
+
+## 4.1 Simple data concatenation
+For the "llava" and "llava_dial" datasets utilized by the LLaVA model, each sample comprises single/multi-round conversations for a single image. To simulate scenarios where a user sequentially asks questions about multiple images, we conducted straightforward data post-processing for these two datasets. Specifically, we randomly concatenated different numbers of samples into a single sample. In the case of "llava," we concatenated 1 to 3 samples, while for "llava_dial," we concatenated 1 to 2 samples.
+
+## 4.2 LLaVA-Otter data blending
+We noticed that the llava and llava_dial datasets used by the LLaVA model and the otter_mimicit_cgd dataset used by the Otter model all use the COCO train2017 images. For the llava and llava_dial datasets, each sample includes single/multi-round conversations for a single image. For the otter_mimicit_cgd dataset, each sample includes a single-round conversation for a pair of images. This enables us to build a synthesized multi-round multi-image dataset, llava_otter_blend, as a more natural blending: for each sample in the otter_mimicit_cgd dataset, we look for llava and llava_dial samples that use the same image, and then build a new sample in a "llava/llava_dial conversations then otter_mimicit_cgd conversation" fashion.
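+
+A hedged sketch of this blending step (again, field names such as `image_id`, `image_ids`, and `conversations` are hypothetical):
+
+```
+from collections import defaultdict
+
+def blend(llava_samples, otter_cgd_samples):
+    by_image = defaultdict(list)        # index llava/llava_dial samples by image id
+    for s in llava_samples:
+        by_image[s["image_id"]].append(s)
+    blended = []
+    for s in otter_cgd_samples:         # each sample covers a pair of images
+        prefix = [p for img in s["image_ids"] for p in by_image.get(img, [])]
+        if prefix:                      # llava/llava_dial turns first, then otter's
+            convo = sum((p["conversations"] for p in prefix), []) + s["conversations"]
+            blended.append({"image_ids": s["image_ids"], "conversations": convo})
+    return blended
+```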
+
+
+
+
+ *Figure 5: A data sample after LLaVA-Otter data blending. Gray dialog boxes are from the LLaVA datasets, and orange ones are from the Otter dataset.*
+
+
+# 5. Demonstration
+We trained our DeepSpeed-VisualChat-13B model with a 2B visual encoder and the 13B LLaMA model on several open-sourced datasets. DeepSpeed-VisualChat-13B shows image captioning capabilities (*Figure 6--8*), counting and text reading (*Figure 6*), celebrity recognition (*Figure 7*), storytelling (*Figure 8*), etc.
+
+
+
+
+ *Figure 6: DeepSpeed-VisualChat can count the number of people in the image and read the text in the first image. It also demonstrates cross-image understanding.*
+
+
+
+
+
+
+ *Figure 7: DeepSpeed-VisualChat can recognize celebrities and associate them with their achievements.*
+
+
+
+
+
+
+ *Figure 8: DeepSpeed-VisualChat can tell stories and recognize movies.*
+
+
+
+# 6. How to begin with DeepSpeed-VisualChat
+DeepSpeed-VisualChat is an easy-to-use training framework with great scalability, having been tested up to LLaMa-2-70B models so far. We adopt a unified instruction tuning format for all experiments, and the template is shown below.
+```
+ % You are a powerful vision-language assistant.
+
+### Image 1: % some image, e.g., cat-1.png
+### Question: % please describe the image.
+### Answer: % It's a cute black cat.
+
+### Image 2: % some image, e.g., cat-2.png
+### Image 3: % some image, e.g., cat-3.png
+### Question: % What's the difference between the three cats?
+### Answer: % The colors of the three cats are different.
+...
+```
+
+Training a model with DeepSpeed-VisualChat is straightforward and convenient. Here we give an example based on the CLIP visual encoder and the LLaMa-7B model:
+```
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/
+pip install -r requirements.txt
+cd training
+bash training_scripts/run_7b.sh
+```
+
+The trained checkpoint will be automatically saved in a Hugging Face-compatible version and can be used to launch your own visual chat API:
+```
+cd ../chat
+bash chat_scripts/run.sh # You need to change necessary variables, e.g., ckpt path
+```
+To support larger model inference, we have incorporated Hugging Face large model inference into our DeepSpeed-VisualChat API. Therefore, users can choose a different number of GPUs based on the GPU memory capacity and the model size.
+
+Please refer to our [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details.
+
+# 7. Release: Try DeepSpeed-VisualChat today!
+
+We are very excited to share that DeepSpeed-VisualChat is now open-sourced and available to the AI community.
+
+* To get started, please visit our GitHub page for DeepSpeed-VisualChat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)
+
+* We will continue to improve DeepSpeed-VisualChat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) shows currently supported features as well as ones that are planned for the future.
+
+
+DeepSpeed-VisualChat is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
+
+* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work!
diff --git a/blogs/deepspeed-visualchat/assets/images/attention.png b/blogs/deepspeed-visualchat/assets/images/attention.png
new file mode 100644
index 000000000000..b01d8f8027ce
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/attention.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/cat-chat.png b/blogs/deepspeed-visualchat/assets/images/cat-chat.png
new file mode 100755
index 000000000000..5a5c27381f65
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/cat-chat.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/ceos.png b/blogs/deepspeed-visualchat/assets/images/ceos.png
new file mode 100644
index 000000000000..e148f545a44b
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/ceos.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/data-blending.png b/blogs/deepspeed-visualchat/assets/images/data-blending.png
new file mode 100644
index 000000000000..a8afb5144fb1
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/data-blending.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/friends.png b/blogs/deepspeed-visualchat/assets/images/friends.png
new file mode 100644
index 000000000000..2689d8d4bb1c
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/friends.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/hero-figure.png b/blogs/deepspeed-visualchat/assets/images/hero-figure.png
new file mode 100644
index 000000000000..ca79b2c6239f
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/hero-figure.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/lake-chat.png b/blogs/deepspeed-visualchat/assets/images/lake-chat.png
new file mode 100755
index 000000000000..c47199737d54
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/lake-chat.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/model.png b/blogs/deepspeed-visualchat/assets/images/model.png
new file mode 100644
index 000000000000..dbd1f05c484b
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/model.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/zootopia.png b/blogs/deepspeed-visualchat/assets/images/zootopia.png
new file mode 100644
index 000000000000..c9e3783ed198
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/zootopia.png differ
diff --git a/blogs/deepspeed4science/README.md b/blogs/deepspeed4science/README.md
new file mode 100644
index 000000000000..a318490329a5
--- /dev/null
+++ b/blogs/deepspeed4science/README.md
@@ -0,0 +1,18 @@
+
+
+# Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies
+
+
+
+[https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)
+
+To cite DeepSpeed4Science, please cite our [white paper](https://arxiv.org/abs/2310.04610):
+
+```
+@article{song2023deepspeed4science,
+ title={DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies},
+ author={Song, Shuaiwen Leon and Kruft, Bonnie and Zhang, Minjia and Li, Conglong and Chen, Shiyang and Zhang, Chengming and Tanaka, Masahiro and Wu, Xiaoxia and Rasley, Jeff and Awan, Ammar Ahmad and others},
+ journal={arXiv preprint arXiv:2310.04610},
+ year={2023}
+}
+```
diff --git a/blogs/deepspeed4science/chinese/README.md b/blogs/deepspeed4science/chinese/README.md
new file mode 100644
index 000000000000..dabc4ab077f2
--- /dev/null
+++ b/blogs/deepspeed4science/chinese/README.md
@@ -0,0 +1,156 @@
+
+
+# DeepSpeed4Science:利用先进的AI系统优化技术实现科学发现
+
+
+
+*此博客为英文博客[Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)的官方翻译*
+
+
+
+
+*图1:DeepSpeed4Science方法概述:专为加速科学发现和应对其复杂性而量身定制的AI系统技术开发。*
+
+
+如需引用 DeepSpeed4Science,请引用我们的[white paper](https://arxiv.org/abs/2310.04610):
+
+```
+@article{song2023deepspeed4science,
+ title={DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies},
+ author={Song, Shuaiwen Leon and Kruft, Bonnie and Zhang, Minjia and Li, Conglong and Chen, Shiyang and Zhang, Chengming and Tanaka, Masahiro and Wu, Xiaoxia and Rasley, Jeff and Awan, Ammar Ahmad and others},
+ journal={arXiv preprint arXiv:2310.04610},
+ year={2023}
+}
+```
+
+## 简介
+
+在接下来的十年中,深度学习可能会彻底改变自然科学,增强我们对自然现象进行建模和预测的能力。这可能预示着科学探索的新时代,为从药物开发到可再生能源的各个领域带来重大进展。为了响应这一机会以及微软“予力全球每一人、每一组织,成就不凡”的使命,[微软DeepSpeed团队](https://www.deepspeed.ai/)启动了一个名为[DeepSpeed4Science](https://deepspeed4science.ai/)的新计划,旨在通过AI系统技术创新帮助领域专家解锁当今最大的科学之谜。
+
+[DeepSpeed](https://www.deepspeed.ai/)系统是由微软开发的业界领先的开源AI系统框架,它为各种AI硬件上的深度学习训练和推理提供了前所未有的规模和速度。图1展示了我们对DeepSpeed4Science这一新计划的基本方法。通过利用DeepSpeed当前的技术方案(训练、推理和压缩)作为基础技术推动器,DeepSpeed4Science将创建一套专为加速科学发现而量身定制的AI系统技术,以应对其独特的复杂性,超越用于加速通用大型语言模型(LLMs)的常见技术方法。我们与拥有科学AI模型的内部和外部团队紧密合作,以发现和解决领域特定AI系统的挑战。这包括气候科学、药物设计、生物学理解、分子动力学模拟、癌症诊断和监测、催化剂/材料发现以及其他领域。
+
+我们的长期愿景是将DeepSpeed4Science发展成一个用于分享支持科学发现的先进AI技术的软件平台和统一代码仓库。DeepSpeed4Science的设计旨在实现包容性,呼应微软的[“AI for Good”承诺](https://www.microsoft.com/en-us/ai/ai-for-good)。这体现在该计划对一系列标志性科学模型的支持上,它们代表了一些最关键的AI4Science应用场景。在这篇博客中,我们展示了DeepSpeed4Science如何帮助解决结构生物学研究中的两个关键AI系统挑战:(1) 解决了以Evoformer为中心的蛋白质结构预测模型中的内存爆炸问题,以及(2)为更好地理解引发大流行的病毒的进化提供AI模型长序列支持。
+
+## 我们的初期主要合作者
+
+DeepSpeed4Science的新系统技术可以用于很多推动科学边界的标志性模型,赋能AI驱动的科学发现。目前,DeepSpeed4Science很荣幸地支持来自[微软研究院AI4Science](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai4science/)、[微软WebXT/Bing](https://www.msn.com/en-us/weather/forecast/)、[美国能源部国家实验室](https://www.energy.gov/national-laboratories)和多所大学的几个关键科学模型。
+
+### 微软内部合作伙伴
+
+#### 科学基础模型(Scientific Foundation Model,SFM),微软研究院AI4Science
+
+
+
+
+
+*图2:科学基础模型(Scientific Foundation Model,SFM)及其当前探索:Distributional Graphormer。*
+
+
+科学基础模型(SFM)旨在创建一个统一的大规模基础模型,以支持自然科学发现,支持多种输入、多个科学领域(例如,药物、材料、生物学、健康等)和计算任务。DeepSpeed4Science合作伙伴关系将为SFM团队提供新的训练和推理技术,以支持他们的新生成AI方法(例如[Distributional Graphormer](https://www.microsoft.com/en-us/research/blog/distributional-graphormer-toward-equilibrium-distribution-prediction-for-molecular-systems/))这样的项目进行持续研究。
+
+#### ClimaX,微软研究院AI4Science
+
+
+
+
+*图3:ClimaX是第一个设计用于执行各种天气和气候建模任务的基础模型。*
+
+
+我们的气候正在发生变化,导致极端天气事件的频率增加。为了减轻负面影响,预测这些事件将发生的地方变得越来越重要。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)是第一个设计用于执行各种天气和气候建模任务的基础模型。它可以吸收许多具有不同变量和分辨率的数据集以提高天气预报的准确性。DeepSpeed4Science正在为ClimaX创建新的系统支持和加速策略,以高效地预训练/微调更大的基础模型,同时处理非常大的高分辨率图像数据(例如,数十到数百PB)和长序列。
+
+#### 分子动力学和机器学习力场(Molecular Dynamics and Machine Learning Force Field),微软研究院AI4Science
+
+
+
+
+*图4:一百万步的分子动力学模拟:RBD-蛋白(RBD-protein)与蛋白抑制剂(protein inhibitor)相互作用。*
+
+
+这个项目模拟了使用[AI驱动的力场模型](https://www.microsoft.com/en-us/research/publication/ai2bmd-efficient-characterization-of-protein-dynamics-with-ab-initio-accuracy/)进行近似第一性原理计算精度的大型(百万原子)分子系统的动态模拟,同时保持了经典分子动力学的效率和可扩展性。这些模拟足够高效,可以生成足够长的轨迹来观察化学上有意义的事件。通常,这个过程需要数百万甚至数十亿的推理步骤。这对优化图神经网络(GNN)+ LLM模型的推理速度提出了重大挑战,DeepSpeed4Science将为此提供新的加速策略。
+
+#### 微软天气,微软WebXT/Bing
+
+
+
+
+*图5:微软降水预报(每4分钟一次对接下来4小时进行预测)。*
+
+
+[微软天气](https://www.msn.com/en-us/weather/forecast/)提供精确的天气信息,[帮助用户为他们的生活方式、健康、工作和活动做出更好的决策](https://blogs.windows.com/windowsexperience/2022/08/31/microsoft-joins-noaas-weather-ready-nation-ambassador-initiative-to-help-improve-americas-readiness-and-response-to-weather-events/)——包括每小时多次更新的准确的10天全球天气预报。此前,微软天气受益于DeepSpeed技术,加速了他们的多GPU训练环境。目前,DeepSpeed4Science正在与微软WebXT天气预报团队合作,进一步增强微软天气预报服务的最新功能和改进。
+
+### 外部合作者
+
+DeepSpeed4Science的旅程始于两个开创性的基于LLM的结构生物学研究AI模型:来自哥伦比亚大学的[OpenFold](https://openfold.io/),一个开源的高保真蛋白质结构预测模型;以及来自[阿贡国家实验室](https://www.anl.gov/)的[GenSLMs](https://github.com/ramanathanlab/genslm),一个获得[ACM戈登贝尔奖](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)的用于学习SARS-CoV-2(COVID-19)基因组的进化的语言模型。作为此次发布的特色展示,它们代表了当今AI驱动的结构生物学研究面临的两个常见AI系统挑战。我们将在下一节中讨论DeepSpeed4Science如何赋能这些科学研究。
+
+此外,DeepSpeed4Science最近扩大了其范围,以支持更多样的科学模型。例如,在我们与阿贡国家实验室合作训练[Aurora Exascale系统](https://www.anl.gov/aurora)上的万亿参数科学模型的工作中,DeepSpeed4Science技术将帮助他们达到这一关键任务所需的性能要求和可扩展性。此外,通过与[橡树岭国家实验室](https://ai-roadmap.ornl.gov/)和[国家癌症研究所(NCI)](https://www.cancer.gov/)合作进行癌症监测,DeepSpeed4Science将帮助从非结构化的临床文本中高保真地提取和分类信息,以供[MOSSAIC项目](https://www.olcf.ornl.gov/tag/mossaic/)使用。[Brookhaven国家实验室](https://www.bnl.gov/world/)还将采用DeepSpeed4Science技术,支持使用LLMs开发大型数字双胞胎模型,以便为清洁能源研究产生更真实的模拟数据。您可以在[deepspeed4science.ai](https://deepspeed4science.ai/)上找到有关我们外部合作者及其科学任务的更多详细信息。
+
+## 合作展示
+
+### 展示(I):DeepSpeed4Science通过DS4Sci_EvoformerAttention消除以Evoformer为中心的结构生物学模型的内存爆炸问题
+
+
+
+
+
+*图6:在训练过程中OpenFold对PDB链7B3A_A的预测。*
+
+
+[OpenFold](https://github.com/aqlaboratory/openfold)是DeepMind的[AlphaFold2](https://alphafold.com/)的开源社区再现,使其可以在新数据集上训练或微调AlphaFold2。研究人员已经使用它从头开始重新训练AlphaFold2,生成新的模型参数集,研究AlphaFold2的早期训练阶段(图6),并开发新的蛋白质折叠系统。
+
+
+
+
+*图7:在OpenFold中,对多序列比对(MSA)Attention内核(包含偏差)变体的训练峰值内存需求。 (左) 使用在AlphaFold2中的EvoformerAttention的原始OpenFold实现。对于这些类型的蛋白质结构预测模型,在训练/推理中的内存爆炸问题是常见的。最先进的FlashAttention无法有效支持这样的Attention变体。 (右) DeepSpeed4Science的一种新解决方案DS4Sci_EvoformerAttention在不影响模型品质的条件下显著地减少了OpenFold的训练峰值内存需求(最多13倍)。*
+
+
+尽管OpenFold使用了最先进的系统技术进行性能和内存优化,但从头开始训练AlphaFold2在计算上仍然很昂贵。现阶段的模型参数量很小,只有9300万个,但它包含了几个需要极大中间内存的特殊Attention变体。在标准AlphaFold2训练的“微调”阶段,仅其中一个变体在半精度下生成的张量就超过了12GB,使其峰值内存需求远远超过相同大小的语言模型。即使使用像activation checkpointing和DeepSpeed ZeRO优化这样的技术,这种内存爆炸问题仍然严重限制了可训练模型的序列长度和MSA深度。此外,近似策略可能会显著影响模型的准确性和收敛性,同时仍然导致内存爆炸,如图7左侧(橙色)所示。
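+
+注意力 logits 的内存占用随序列长度二次增长,这正是内存爆炸的根源。下面用一组纯属示意的维度(并非 OpenFold 的真实配置)粗略估算半精度下单个 MSA 注意力变体的 logits 张量大小:
+
+```
+# logits 形状为 (n_seq, n_heads, n_res, n_res),半精度下每个元素占 2 字节
+n_seq, n_heads, n_res = 512, 8, 1024   # 假设值,仅用于演示
+logits_bytes = n_seq * n_heads * n_res * n_res * 2
+print(f"{logits_bytes / 2**30:.1f} GiB")  # 8.0 GiB,且随 n_res 二次增长
+```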
+
+为了应对结构生物学研究(例如,蛋白质结构预测和平衡分布预测)中的这一常见系统挑战,DeepSpeed4Science通过为这类科学模型中广泛出现的注意力变体(即EvoformerAttention)设计定制的精确注意力内核来解决这一内存效率问题。具体来说,我们设计了一套由复杂的融合/矩阵分块策略和动态内存减少方法而组成的高内存效率DS4Sci_EvoformerAttention内核,作为高质量机器学习模块供更广泛的生物学研究社区使用。通过整合到OpenFold中,这些定制内核在训练期间提供了显著的加速,并显著减少了模型的训练和推理的峰值内存需求。这使得OpenFold可以用更大、更复杂的模型,使用更长的序列在更广泛的硬件上进行实验。关于这项技术的详细信息可以在[这里](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/)找到。
+
+### 展示(II):DeepSpeed4Science通过系统和算法方法为基因组基础模型(例如,GenSLMs)提供长序列支持
+
+
+
+
+*图8:GenSLMs:获2022年ACM戈登贝尔奖的COVID基因组模型(基于GPT-NeoX的25B/33B模型),用于学习描述SARS-CoV-2基因组生物学意义的潜在空间。这个GIF展示了重要蛋白质家族苹果酸脱氢酶(malate dehydrogenase)的潜在空间投影,并按序列长度和GC含量(核酸中鸟嘌呤和胞嘧啶相对于腺嘌呤和胸腺嘧啶的比例,用于衡量DNA链的耐热能力)等重要特征着色。*
+
+
+[GenSLMs](https://github.com/ramanathanlab/genslm),一个来自阿贡国家实验室的[2022年ACM 戈登贝尔奖获奖](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)的基因组模型,可以通过大型语言模型(LLMs)的基因组数据训练来学习SARS-CoV-2(COVID-19)基因组的进化。它旨在改变如何识别和分类引发大流行的病毒(特别是SARS-CoV-2)的新变种。GenSLMs代表了第一批可以泛化到其他预测任务的基因组基础模型。对潜在空间的良好理解可以帮助GenSLMs处理超出仅仅是病毒序列的新领域,并扩展它们模拟细菌病原体甚至真核生物的能力(例如,理解功能、途径成员资格和进化关系等事物)。为了实现这一科学目标,GenSLMs和类似的模型需要非常长的序列支持用于训练和推理,这超出了像[FlashAttention](https://arxiv.org/abs/2307.08691)这样的通用LLM的长序列策略。通过DeepSpeed4Science的新设计,科学家现在可以构建和训练具有显著更长的上下文窗口的模型,允许他们探索以前无法访问的关系。
+
+
+
+
+*图9:由不同框架在不同规模下支持的两个GenSLMs模型的最大序列长度。使用NVIDIA DGX,每个节点有八个40G A100 GPU。*
+
+
+具体在系统层面,我们发布了包括[长序列支持和其他新优化](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)的最新的[Megatron-DeepSpeed框架](https://github.com/microsoft/Megatron-DeepSpeed)。科学家现在可以通过我们新添加的内存优化技术(如注意力掩码卸载和位置编码分割)、张量并行、流水线并行、序列并行、基于ZeRO的数据并行和模型状态卸载等技术的协同组合,用更长的序列训练他们的GenSLMs等大型科学模型。图9展示了我们的新版本使GenSLMs的25B和33B模型的最长序列长度分别比之前的Megatron-DeepSpeed版本增加了12倍和14倍。在支持的序列长度方面,这个新Megatron-DeepSpeed框架也显著地超过了NVIDIA的Megatron-LM(对于25B和33B模型分别高达9.8倍和9.1倍)。例如,阿贡实验室团队的GenSLMs 25B模型在64个GPU上的原始序列长度为42K,而现在可以用512K的核苷酸序列进行训练。这在不损失准确性的条件下大大提高了模型质量和科学发现的范围。对于更偏好相对位置编码等算法策略的领域科学家,这个[新版本](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)也集成了相应的支持。
+
+## 总结和路线图
+
+我们非常自豪和兴奋地宣布DeepSpeed4Science计划以及几个研发亮点和成果。从今天开始,我们将在[deepspeed4science.ai](https://deepspeed4science.ai/)上介绍我们的新计划,包括关于我们的外部合作者的信息,以及当前和未来的DeepSpeed4Science技术发布。我们的一个高层次目标是推广广泛解决大规模科学发现的主要系统痛点的AI系统技术。我们希望全球的科学家们能够从DeepSpeed4Science通过开源软件解锁的新功能中受益。我们期待更好地了解阻碍您的科学发现的AI系统设计挑战。我们真诚地欢迎您的参与,帮助构建一个更有前途的AI4Science未来。请发送电子邮件至 deepspeed-info@microsoft.com 与我们联系。我们鼓励您在我们的[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)上报告问题、贡献PR、参与讨论。
+
+## 致谢
+
+**Core DeepSpeed4Science Team:**
+
+Shuaiwen Leon Song (DeepSpeed4Science lead), Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Xiaoxia (Shirley) Wu, Masahiro Tanaka, Martin Cai, Adam Graham, Charlie Zhou, Yuxiong He (DeepSpeed team lead)
+
+**Our Founding Collaborators (in alphabetical order):**
+
+**Argonne National Lab team:** Rick Stevens, Cristina Negri, Rao Kotamarthi, Venkatram Vishwanath, Arvind Ramanathan, Sam Foreman, Kyle Hippe, Troy Arcomano, Romit Maulik, Maxim Zvyagin, Alexander Brace, Yuntian Deng, Bin Zhang, Cindy Orozco Bohorquez, Austin Clyde, Bharat Kale, Danilo Perez-Rivera, Heng Ma, Carla M. Mann, Michael Irvin, J. Gregory Pauloski, Logan Ward, Valerie Hayot, Murali Emani, Zhen Xie, Diangen Lin, Maulik Shukla, Weili Nie, Josh Romero, Christian Dallago, Arash Vahdat, Chaowei Xiao, Thomas Gibbs, Ian Foster, James J. Davis, Michael E. Papka, Thomas Brettin, Anima Anandkumar
+
+**AMD:** Ivo Bolsen, Micheal Schulte, Bo Begole, Angela Dalton, Steve Reinhart, Ashwin Aji, Jalal Mahmud, Mahesh Balashibramanian
+
+**Brookhaven National Lab team:** Adolfy Hoisie, Shinjae Yoo, Yihui Ren.
+
+**Columbia University OpenFold team:** Mohammed AlQuraishi, Gustaf Ahdritz
+
+**Microsoft Research AI4Science team:** Christopher Bishop, Bonnie Kruft, Max Welling, Tie-Yan Liu, Christian Bodnar, Johannes Brandsetter, Wessel Bruinsma, Chan Cao, Yuan-Jyue Chen, Peggy Dai, Patrick Garvan, Liang He, Elizabeth Heider, PiPi Hu, Peiran Jin, Fusong Ju, Yatao Li, Chang Liu, Renqian Luo, Qi Meng, Frank Noe, Tao Qin, Janwei Zhu, Bin Shao, Yu Shi, Wenlei Shi, Gregor Simm, Megan Stanley, Lixin Sun, Yue Wang, Tong Wang, Zun Wang, Lijun Wu, Yingce Xia, Leo Xia, Shufang Xie, Shuxin Zheng, Jianwei Zhu
+
+**Oakridge National Lab team:** Prassana Balaprakash, Georgia Tourass
+
+**Princeton University:** William Tang, Kyle Felker, Alexey Svyatkovskiy (Microsoft liaison)
+
+**Rutgers University:** Hang Liu
+
+**WebXT Weather team:** Pete Luferenko, Divya Kumar, Jonathan Weyn, Ruixiong Zhang, Sylwester Klocek, Volodymyr Vragov
diff --git a/blogs/deepspeed4science/japanese/README.md b/blogs/deepspeed4science/japanese/README.md
new file mode 100644
index 000000000000..276528650ab5
--- /dev/null
+++ b/blogs/deepspeed4science/japanese/README.md
@@ -0,0 +1,156 @@
+
+
+# DeepSpeed4Scienceイニシアティブ: 洗練されたAIシステムのテクノロジーにより大規模な科学的発見を可能に
+
+
+
+*こちらは英語ブログ[Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)の公式の翻訳です*
+
+
+
+
+*図1:DeepSpeed4Scienceのアプローチ: 汎用の言語モデルのサポートを超え、科学的発見とその複雑さの解決に特化したAI技術を開発*
+
+
+DeepSpeed4Science を引用するには、こちらの[white paper](https://arxiv.org/abs/2310.04610)を引用してください:
+
+```
+@article{song2023deepspeed4science,
+ title={DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies},
+ author={Song, Shuaiwen Leon and Kruft, Bonnie and Zhang, Minjia and Li, Conglong and Chen, Shiyang and Zhang, Chengming and Tanaka, Masahiro and Wu, Xiaoxia and Rasley, Jeff and Awan, Ammar Ahmad and others},
+ journal={arXiv preprint arXiv:2310.04610},
+ year={2023}
+}
+```
+
+## はじめに
+
+自然の出来事をモデル化し予測する深層学習の能力は急速に高まっており、次の10年間に、自然科学に革命を起こすかも知れません。薬の開発から再生可能エネルギーまでの各セクターで、大きな進展をもたらす新しい科学的探求の時代が到来するでしょう。「地球上のすべての人と組織がもっと多くのことを成し遂げられるようにする」というMicrosoftのミッションに従い、この機会に、[DeepSpeedチーム](https://www.deepspeed.ai/)では[DeepSpeed4Science](https://deepspeed4science.ai/)という新しいイニシアティブを立ち上げました。これは、AIシステム技術のイノベーションを通じて他に類を見ない技術を構築し、様々な分野の専門家が、科学分野における大きな謎を解き明かす手助けをすることを目指しています。
+
+[DeepSpeed](https://www.deepspeed.ai/)システムは、Microsoftが開発した、AI分野をリードするオープンソースのAIシステムのフレームワークであり、多様なAIハードウェア上での深層学習の訓練と推論において、前例のない規模と速度を実現します。図1は、この新しいDeepSpeed4Scienceイニシアティブでの基本的なアプローチを示しています。DeepSpeedの現在の柱となる技術(訓練、推論、圧縮)を基盤として活用しつつ、DeepSpeed4Scienceでは、大規模言語モデル(LLM)を加速するための汎用の技術的アプローチを超え、科学的発見を加速する目的で新たに構築された、一連のAIシステム技術を提供します。私たちは、重要な科学的ミッションを推進している、代表的な科学分野向けAIモデルを所有する内外のチームと連携し、ドメイン固有のAIシステムの課題を特定し、解決していきます。これには、気候科学、薬物設計、生物学的理解、分子動力学シミュレーション、がんの診断と監視、触媒/材料の発見、およびその他の分野が含まれます。
+
+私たちの長期的なビジョンは、DeepSpeed4Scienceを、科学的発見をサポートする先進的なAIシステム技術を共有するための新しいソフトウェアプラットフォームおよび統一的なリポジトリに発展させることです。DeepSpeed4Scienceは、Microsoftの[AI for Good](https://www.microsoft.com/en-us/ai/ai-for-good)のコミットメントを反映して、包括的に設計されています。このことは、AI4Scienceへのもっとも重要な投資の成果として構築された、様々な代表的モデルへの、DeepSpeed4Scienceイニシアティブによるサポートに現れています。このブログでは、DeepSpeed4Scienceが、構造生物学の研究における2つの重要なシステムの課題にどのように対処するかを紹介します:(1) Evoformer中心のタンパク質構造予測モデルをスケールアップする際に極めて大きなメモリが必要となる問題を解決し、(2) パンデミックを引き起こすウイルスの進化の様子をよりよく理解するための非常に長いシーケンスのサポートを可能にします。
+
+## 主要な初期コラボレータ
+
+DeepSpeed4Scienceによる新しいシステム技術はAI駆動の幅広い科学研究を強化するものです。現在、DeepSpeed4Scienceは、[Microsoft Research AI4Science](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai4science/)、[Microsoft WebXT/Bing](https://www.msn.com/en-us/weather/forecast/)、[U.S. DoE National Labs](https://www.energy.gov/national-laboratories)、および複数の大学のいくつかの重要な科学モデルをサポートしています。
+
+### Microsoft内のパートナーシップ
+
+#### 科学基盤モデル (Scientific Foundation Model, SFM), Microsoft Research AI4Science
+
+
+
+
+
+*図2: 科学基盤モデル (Scientific foundation model, SFM) とその探索: Distributional Graphormer*
+
+
+科学的基盤モデル(SFM)は、多様なインプット、複数の科学領域(薬物、材料、生物学、健康など)、および計算タスクをサポートする、自然科学的発見を強化するための統一された大規模基盤モデルを作成することを目的としています。DeepSpeed4Scienceパートナーシップは、[Distributional Graphormer](https://www.microsoft.com/en-us/research/blog/distributional-graphormer-toward-equilibrium-distribution-prediction-for-molecular-systems/)などのMicrosoftの新しい生成AI手法などのプロジェクトに関する、SFMチームの継続的な研究を強化するための新しい訓練および推論テクノロジーを提供します。
+
+#### ClimaX, Microsoft Research AI4Science
+
+
+
+
+*図3: 天気・気候の多様なモデリングタスクのための最初の基盤モデルClimaX*
+
+
+気候の変化は、より頻繁な異常気象を引き起こしています。悪影響を軽減するため、これらのイベントが発生する場所を予測することがますます重要になっています。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)は、さまざまな気象および気候モデリングタスクを実行するために設計された最初の基盤モデルです。さまざまな変数と解像度を持つ多くの異なるデータセットを扱えるため、天気予報の精度が向上する可能性があります。DeepSpeed4Scienceは、非常に大きな高解像度画像データ(数十から数百ペタバイトなど)を長いシーケンスで処理しながら、より大きな基盤モデルを効率的に事前訓練/ファインチューニングするためのClimaXの新しいシステムサポートを提供しています。
+
+#### 分子動力学と機械学習型力場(Molecular Dynamics and Machine Learning Force Field),Microsoft Research AI4Science
+
+
+
+
+*図4: 100万ステップの分子動力学シミュレーション: RBD-proteinとprotein inhibitorの相互作用*
+
+
+このプロジェクトは、古典的な分子動力学の効率とスケーラビリティを維持しながら、[AIを利用した力場モデル](https://www.microsoft.com/en-us/research/publication/ai2bmd-efficient-characterization-of-protein-dynamics-with-ab-initio-accuracy/)を使用して、原理に基づく精度(ab initio accuracy)に近い精度で大規模(原子数で100万規模)な分子システムの力学をシミュレートします。このシミュレーションは、化学的に重要なイベントを観察するのに十分な長さの軌道を生成できる効率を実現しています。通常、このプロセスには数百万から数十億の推論ステップが必要です。これは、グラフニューラルネットワーク(GNN)+ LLMモデルの推論速度を最適化する上で大きな課題となります。DeepSpeed4Scienceは、この課題に対して、新しいシステムサポートを提供します。
+
+#### 天気 from Microsoft Start, Microsoft WebXT/Bing
+
+
+
+
+*図5: Microsoft Startにおける降水予想 (次の4時間について4分ごと)*
+
+
+[天気 from Microsoft Start](https://www.msn.com/en-us/weather/forecast/)は、[ユーザーがライフスタイル、健康、仕事、活動についてより適切な決定を下せるよう](https://blogs.windows.com/windowsexperience/2022/08/31/microsoft-joins-noaas-weather-ready-nation-ambassador-initiative-to-help-improve-americas-readiness-and-response-to-weather-events/)、正確な気象情報を提供します。 (1 時間ごとに複数回更新される、10 日間に渡る正確かつグローバルな天気予報など)。 以前にも、この天気予報は、DeepSpeedの技術を使用して、マルチ GPU を用いた訓練を高速化していました。現在、DeepSpeed4ScienceはMicrosoft WebXT気象チームと協力して、最先端の機能と更なる改善により、マイクロソフトの気象サービスをさらに強化しています。
+
+### 外部のコラボレータ
+
+DeepSpeed4Scienceは、構造生物学研究のための2つの先駆的なLLMベースのAIモデルを扱うことから始まりました: オープンソースのハイフィデリティタンパク質構造予測モデルであるコロンビア大学の[OpenFold](https://openfold.io/)と、SARS-CoV-2(COVID-19)ゲノムの進化を学習する、[Gordon Bell Special Prize](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)を受賞したゲノム用言語モデルである[アルゴンヌ国立研究所](https://www.anl.gov/)の[GenSLMs](https://github.com/ramanathanlab/genslm)です。次のセクションでは、今日のAI主導の構造生物学研究が直面している2つの一般的なAIシステムの課題を紹介し、DeepSpeed4Scienceが科学研究をどのように強化したかについて説明します。
+
+またDeepSpeed4Scienceは最近、より多様な科学モデルをサポートするために、その対象を拡大しました。たとえば、[Aurora Exascaleシステム](https://www.anl.gov/aurora)で、1兆パラメータの科学モデルを訓練するアルゴンヌ国立研究所との協力にあたって、DeepSpeed4Scienceテクノロジーは、求められるパフォーマンス要件とスケーラビリティを実現するのに重要な役割を果たします。さらに、DeepSpeed4Scienceは、がんの調査に関して、[オークリッジ国立研究所](https://ai-roadmap.ornl.gov/)および[国立がん研究所(NCI)](https://www.cancer.gov/)と協力することにより、[MOSSAICプロジェクト](https://www.olcf.ornl.gov/tag/mossaic/)の非構造化臨床テキストからの情報の高信頼度抽出と分類にも用いられます。さらに、DeepSpeed4Scienceのテクノロジーは、[ブルックヘブン国立研究所](https://www.bnl.gov/world/)にも採用され、LLMを使用してより現実的なシミュレーションデータを生成することにより、クリーンエネルギー研究用の大規模なデジタルツインモデルの開発をサポートします。外部のコラボレータとその科学ミッションに関するより詳細な情報は、[deepspeed4science.ai](https://deepspeed4science.ai/)に掲載しています。
+
+## パートナーシップの事例
+
+### 事例(I): DeepSpeed4ScienceのDS4Sci_EvoformerAttentionにより、Evoformerで構成された生物学モデルをスケールアップする際のメモリ問題を解決
+
+
+
+
+
+*図6: モデル学習の進行に伴うPDB chain 7B3A_AについてのOpenFoldの予測*
+
+
+[OpenFold](https://github.com/aqlaboratory/openfold)は、DeepMindによる[AlphaFold2](https://alphafold.com/)をオープンソースで再現したものであり、新しいデータセットでAlphaFold2を訓練またはファインチューニングすることを可能にします。研究者は、これを使用して、AlphaFold2をゼロから再訓練して新しいモデルパラメータを作成し、AlphaFold2の初期訓練フェーズを研究し(図6)、新しいタンパク質フォールディングシステムを開発しました。
+
+
+
+
+*図7: OpenFoldで可能な最大の訓練サンプル次元を持つ多重配列アライメント(MSA)アテンションカーネル(バイアス付き)のバリエーションを訓練するために必要なピークメモリ。(左)AlphaFold2で使用されているEvoformerAttentionを用いたオリジナルのOpenFold実装。この種のタンパク質構造予測モデルの訓練/推論では、極めて多くのメモリが必要とされることは一般的な課題となっている。特に、最新技術として広く知られるFlashAttentionでも、このような科学研究のためのアテンションのバリエーションを効果的にサポートできない。(右)DS4Sci_EvoformerAttentionと呼ばれるDeepSpeed4Scienceの新しい技術は、精度を落とすことなく、OpenFoldモデルの訓練に必要なピークメモリを1/13に大幅に削減する。*
+
+
+OpenFoldには、最先端のシステムテクノロジーを使用したパフォーマンスとメモリの最適化が含まれていますが、AlphaFold2をゼロから訓練することは依然として大きな計算コストがかかります。現段階でのモデルは、パラメータ数の絶対値は小さい(9,300万個)のですが、極めて大きなアクティベーションを持つアテンションのバリエーションが含まれています。標準的なAlphaFold2訓練のファインチューニングフェーズでは、これらのバリエーションのうちの1つが生成したロジットテンソル(入力としてモデルに供給されるディープタンパク質MSAに対応するように設計されたもの)は、半精度浮動小数で12GBを超え、同等のサイズの言語モデルが使用するメモリを大幅に上回ります。Activation checkpointingや、DeepSpeed ZeRO 最適化などの手法を使用しても、非常に多くのメモリが必要とされるため、モデルを訓練できるシーケンスの長さと MSA の深さが大幅に制限されます。さらに、近似解を与えるような戦略を用いると、モデルの精度と収束に大きな影響を与える可能性があり、それでもメモリが爆発的に増加します(図7の左側のバー(オレンジ色))。
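+
+アテンションのlogitsが占めるメモリは配列長に対して二次関数的に増加し、これがメモリ爆発の原因です。以下は、説明用の仮の次元(OpenFoldの実際の設定ではありません)を使って、半精度でのMSAアテンションのlogitsテンソルのサイズを概算するスケッチです:
+
+```
+# logits の形状は (n_seq, n_heads, n_res, n_res)、半精度では1要素あたり2バイト
+n_seq, n_heads, n_res = 512, 8, 1024   # 説明用の仮の値
+logits_bytes = n_seq * n_heads * n_res * n_res * 2
+print(f"{logits_bytes / 2**30:.1f} GiB")  # 8.0 GiB(n_res に対して二次的に増加)
+```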
+
+DeepSpeed4Scienceは、構造生物学研究(タンパク質構造予測や平衡分布予測など)におけるこの一般的なシステムの課題に対処するために、このカテゴリの科学モデルに広く見られるアテンションのバリエーション(つまりEvoformerAttention)用にカスタマイズされた正確なアテンションのカーネルを設計することにより、このメモリの非効率性の問題に対処しています。具体的には、高度なフュージョン/タイリング戦略とオンザフライのメモリ削減方法によって可能になるメモリ効率の高いDS4Sci_EvoformerAttentionカーネルのセットを、高品質の機械学習プリミティブとして、より広いコミュニティ向けに作成しました。これらをOpenFoldに組み込むことで、訓練中の速度が大幅に向上し、訓練と推論のためのモデルのピークメモリが大幅に削減されます。これにより、OpenFoldはより大きく、より複雑なモデル、より長いシーケンスで実験し、より幅広いハードウェアで訓練することができます。この技術の詳細については、[こちら](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/)をご覧ください。
+
+### 事例(II): DeepSpeed4Scienceのシステムとアルゴリズムの両方からのアプローチにより、ゲノム基盤モデルでの非常に長い系列の使用をサポート
+
+
+
+
+*図8: GenSLMs:2022年ACM Gordon Bell Special Prize受賞COVIDゲノム用モデル(GPT-NeoXに基づく25B/33Bモデル)。SARS-CoV-2ゲノムの生物学的に意味のある特性を記述する潜在空間を学習するために使用される。このGIFは、重要なタンパク質ファミリーであるリンゴ酸デヒドロゲナーゼ(malate dehydrogenase)を可視化し、配列の長さやGC含量(アデニンとチミンと比較した核酸グアニンとシトシンの含量の比率。これはDNA鎖が熱に耐える能力を測るものである。)などの重要な特徴で色付けされた潜在空間の投影を表示している。*
+
+
+アルゴンヌ国立研究所が開発し、[2022年ACM Gordon Bell Special Prize](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)を受賞したゲノム用言語モデルである[GenSLMs](https://github.com/ramanathanlab/genslm)は、ゲノムデータに大規模言語モデル(LLM)を適用することにより、SARS-CoV-2(COVID-19)ゲノムの進化を学習します。これは、パンデミックを引き起こすウイルス、特にSARS-CoV-2の新たに出現する亜種を特定し、分類する方法を変えるように設計されています。GenSLMsは、他の予測タスクに一般化できる最初のゲノム基盤モデルの1つです。潜在空間をうまく表現することにより、GenSLMsはウイルス配列だけでなく新しいドメインに適用し、細菌性病原体や真核生物をモデル化する能力を拡大し、機能、経路のメンバーシップ、進化的関係などを理解することができます。この科学的目標を達成するために、GenSLMsおよび同様のモデルは、[FlashAttention](https://arxiv.org/abs/2307.08691)のように、長いシーケンスのための一般的な戦略では扱うことが困難なレベルの、非常に長いシーケンスサポートを、訓練と推論の両方に対して必要とします。DeepSpeed4Scienceの新しい設計により、科学者はより長いシーケンスでモデルを構築および訓練できるようになり、以前は扱えなかった科学探索が可能になりました。
+
+
+
+
+*図9: 異なるスケールで異なるフレームワークがサポートする2つのGenSLMsモデルの最大シーケンス長。1ノードあたり8個の40G A100 GPUを搭載したNVIDIA DGXノードを使用。*
+
+
+システムレベルでは、非常に長いシーケンスをサポートするための最新の[Megatron-DeepSpeedフレームワーク](https://github.com/microsoft/Megatron-DeepSpeed)を、[他の新しい最適化とともにリリースします](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)。科学者は、(アテンションマスクと位置の埋め込みに関する)新しく追加されたメモリ最適化手法、テンソル並列処理、パイプライン並列処理、シーケンス並列処理、ZeROスタイルのデータ並列処理、モデル状態のオフロードなどの技術の相乗的な組み合わせにより、GenSLMsのような大規模な科学モデルをはるかに長いシーケンスで訓練できるようになりました。図9は、新しいリリースにより、GenSLMsの25Bおよび33Bモデルで、以前のMegatron-DeepSpeedよりもそれぞれ最大12倍および14倍の最長シーケンス長を処理できることを示しています。サポートされているシーケンス長に関しては、この新しいMegatron-DeepSpeedは、25Bモデルと33Bモデルでそれぞれ最大9.8倍と9.1倍でNVIDIAのMegatron-LMを大幅に上回っています。たとえば、GenSLMsの25Bモデルは、64個のGPUでのアルゴンヌチームの元の42Kシーケンス長と比較して、512Kのヌクレオチド配列で訓練できるようになりました。これにより、精度を損なうことなく、モデルの品質と科学的発見の範囲が大幅に向上します。Relative position embeddingなどのアルゴリズム戦略を必要とする科学者向けの追加サポートも、[このリリース](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)に統合されています。
+
+## まとめとロードマップ
+
+DeepSpeed4Scienceイニシアティブを、いくつかのR&Dのハイライトや成果と共に発表できることを嬉しく思います。本日から、外部の協力者に関する情報や、現在および将来のDeepSpeed4Scienceテクノロジーリリースなど、新しいイニシアティブでの活動を[deepspeed4science.ai](https://deepspeed4science.ai/)上で進めていきます。私たちの高レベルな目標の1つは、大規模な科学的発見のための主要なシステムの問題点に広く対処するAIシステムテクノロジーを一般化することです。世界中の科学者によって、オープンソースのソフトウェアを通じてDeepSpeed4Scienceによって利用可能になる新機能が活用されることを願っています。科学的発見の障害となるAIシステム設計の課題を解決していくことを楽しみにしています。AI4Scienceの有望な未来を築くために、皆様の参加を歓迎します。お問い合わせは deepspeed-info@microsoft.com までお願いします。問題の報告や、PRを通じての貢献、ディスカッションへの参加は、[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)でお願いします。
+
+## 謝辞
+
+**Core DeepSpeed4Science Team:**
+
+Shuaiwen Leon Song (DeepSpeed4Science lead), Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Xiaoxia (Shirley) Wu, Masahiro Tanaka, Martin Cai, Adam Graham, Charlie Zhou, Yuxiong He (DeepSpeed team lead)
+
+**Our Founding Collaborators (in alphabetical order):**
+
+**Argonne National Lab team:** Rick Stevens, Cristina Negri, Rao Kotamarthi, Venkatram Vishwanath, Arvind Ramanathan, Sam Foreman, Kyle Hippe, Troy Arcomano, Romit Maulik, Maxim Zvyagin, Alexander Brace, Yuntian Deng, Bin Zhang, Cindy Orozco Bohorquez, Austin Clyde, Bharat Kale, Danilo Perez-Rivera, Heng Ma, Carla M. Mann, Michael Irvin, J. Gregory Pauloski, Logan Ward, Valerie Hayot, Murali Emani, Zhen Xie, Diangen Lin, Maulik Shukla, Weili Nie, Josh Romero, Christian Dallago, Arash Vahdat, Chaowei Xiao, Thomas Gibbs, Ian Foster, James J. Davis, Michael E. Papka, Thomas Brettin, Anima Anandkumar
+
+**AMD:** Ivo Bolsen, Micheal Schulte, Bo Begole, Angela Dalton, Steve Reinhart, Ashwin Aji, Jalal Mahmud, Mahesh Balashibramanian
+
+**Brookhaven National Lab team:** Adolfy Hoisie, Shinjae Yoo, Yihui Ren.
+
+**Columbia University OpenFold team:** Mohammed AlQuraishi, Gustaf Ahdritz
+
+**Microsoft Research AI4Science team:** Christopher Bishop, Bonnie Kruft, Max Welling, Tie-Yan Liu, Christian Bodnar, Johannes Brandsetter, Wessel Bruinsma, Chan Cao, Yuan-Jyue Chen, Peggy Dai, Patrick Garvan, Liang He, Elizabeth Heider, PiPi Hu, Peiran Jin, Fusong Ju, Yatao Li, Chang Liu, Renqian Luo, Qi Meng, Frank Noe, Tao Qin, Janwei Zhu, Bin Shao, Yu Shi, Wenlei Shi, Gregor Simm, Megan Stanley, Lixin Sun, Yue Wang, Tong Wang, Zun Wang, Lijun Wu, Yingce Xia, Leo Xia, Shufang Xie, Shuxin Zheng, Jianwei Zhu
+
+**Oakridge National Lab team:** Prassana Balaprakash, Georgia Tourass
+
+**Princeton University:** William Tang, Kyle Felker, Alexey Svyatkovskiy (Microsoft liaison)
+
+**Rutgers University:** Hang Liu
+
+**WebXT Weather team:** Pete Luferenko, Divya Kumar, Jonathan Weyn, Ruixiong Zhang, Sylwester Klocek, Volodymyr Vragov
diff --git a/blogs/deepspeed4science/media/Figure1.png b/blogs/deepspeed4science/media/Figure1.png
new file mode 100644
index 000000000000..614c4b40d6a1
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure1.png differ
diff --git a/blogs/deepspeed4science/media/Figure2-1.png b/blogs/deepspeed4science/media/Figure2-1.png
new file mode 100644
index 000000000000..bb0b8d9206d1
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-1.png differ
diff --git a/blogs/deepspeed4science/media/Figure2-2.gif b/blogs/deepspeed4science/media/Figure2-2.gif
new file mode 100644
index 000000000000..0890be7d7e31
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-2.gif differ
diff --git a/blogs/deepspeed4science/media/Figure3.png b/blogs/deepspeed4science/media/Figure3.png
new file mode 100644
index 000000000000..465e80e15a25
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure3.png differ
diff --git a/blogs/deepspeed4science/media/Figure4.gif b/blogs/deepspeed4science/media/Figure4.gif
new file mode 100644
index 000000000000..b45a5f28fd36
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure4.gif differ
diff --git a/blogs/deepspeed4science/media/Figure5.gif b/blogs/deepspeed4science/media/Figure5.gif
new file mode 100644
index 000000000000..a26c20103269
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure5.gif differ
diff --git a/blogs/deepspeed4science/media/Figure6-1.png b/blogs/deepspeed4science/media/Figure6-1.png
new file mode 100644
index 000000000000..65f7f9309f71
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure6-1.png differ
diff --git a/blogs/deepspeed4science/media/Figure6-2.gif b/blogs/deepspeed4science/media/Figure6-2.gif
new file mode 100644
index 000000000000..b50588c227d7
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure6-2.gif differ
diff --git a/blogs/deepspeed4science/media/Figure7.jpg b/blogs/deepspeed4science/media/Figure7.jpg
new file mode 100644
index 000000000000..eaa92007268b
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure7.jpg differ
diff --git a/blogs/deepspeed4science/media/Figure8.gif b/blogs/deepspeed4science/media/Figure8.gif
new file mode 100644
index 000000000000..624384910f2a
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure8.gif differ
diff --git a/blogs/deepspeed4science/media/Figure9.png b/blogs/deepspeed4science/media/Figure9.png
new file mode 100644
index 000000000000..f00fd9b6917f
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure9.png differ
diff --git a/blogs/zeropp/assets/images/eval1.png b/blogs/zeropp/assets/images/eval1.png
new file mode 100644
index 000000000000..8312c1db6de1
Binary files /dev/null and b/blogs/zeropp/assets/images/eval1.png differ
diff --git a/blogs/zeropp/assets/images/eval2.png b/blogs/zeropp/assets/images/eval2.png
new file mode 100644
index 000000000000..b6fd05f8cd98
Binary files /dev/null and b/blogs/zeropp/assets/images/eval2.png differ
diff --git a/blogs/zeropp/assets/images/eval3.png b/blogs/zeropp/assets/images/eval3.png
new file mode 100644
index 000000000000..4675e2041d84
Binary files /dev/null and b/blogs/zeropp/assets/images/eval3.png differ
diff --git a/blogs/zeropp/assets/images/hpz.png b/blogs/zeropp/assets/images/hpz.png
new file mode 100644
index 000000000000..790903cff68b
Binary files /dev/null and b/blogs/zeropp/assets/images/hpz.png differ
diff --git a/blogs/zeropp/assets/images/overview.png b/blogs/zeropp/assets/images/overview.png
new file mode 100644
index 000000000000..8e261b533528
Binary files /dev/null and b/blogs/zeropp/assets/images/overview.png differ
diff --git a/blogs/zeropp/assets/images/qgz.gif b/blogs/zeropp/assets/images/qgz.gif
new file mode 100644
index 000000000000..90716d325a04
Binary files /dev/null and b/blogs/zeropp/assets/images/qgz.gif differ
diff --git a/blogs/zeropp/assets/images/qwz.png b/blogs/zeropp/assets/images/qwz.png
new file mode 100644
index 000000000000..ae68c322668f
Binary files /dev/null and b/blogs/zeropp/assets/images/qwz.png differ
diff --git a/blogs/zeropp/assets/images/rlhf-eval.png b/blogs/zeropp/assets/images/rlhf-eval.png
new file mode 100644
index 000000000000..d9b1f3d272c1
Binary files /dev/null and b/blogs/zeropp/assets/images/rlhf-eval.png differ
diff --git a/blogs/zeropp/assets/images/zero-overview.gif b/blogs/zeropp/assets/images/zero-overview.gif
new file mode 100644
index 000000000000..65051947f79d
Binary files /dev/null and b/blogs/zeropp/assets/images/zero-overview.gif differ
diff --git a/blogs/zeropp/chinese/README.md b/blogs/zeropp/chinese/README.md
new file mode 100644
index 000000000000..e4a6b5279de5
--- /dev/null
+++ b/blogs/zeropp/chinese/README.md
@@ -0,0 +1,185 @@
+
+
+# DeepSpeed ZeRO++: 4x less network communication for dramatically more efficient training of large models and ChatGPT-like models
+
+
+
+
+
+
+Figure 1: An overview of DeepSpeed ZeRO++
+
+
+Large AI models are transforming the digital world. Generative language models built on large language models (LLMs), such as Turing-NLG, ChatGPT, and GPT-4, are remarkably versatile and can perform tasks like summarization, code generation, and translation. Similarly, large multimodal generative models such as DALL·E, Microsoft Designer, and Bing Image Creator can produce art, architecture, video, and other digital assets, empowering content creators, architects, and engineers to explore entirely new frontiers of creative productivity.
+
+However, training these large models requires substantial memory and computing resources across hundreds or even thousands of GPU devices. For example, training the [Megatron-Turing NLG 530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) model used more than 4,000 NVIDIA A100 GPUs. Using these resources efficiently requires a sophisticated optimization system that partitions the model sensibly across the memory of the individual devices and parallelizes the computation on them effectively. At the same time, for the deep learning community to easily train large models, these optimizations must be easy to use.
+
+The ZeRO [family of optimizations](https://www.deepspeed.ai/tutorials/zero/) in DeepSpeed offers a powerful solution to these challenges and has been widely used to train large deep learning models such as TNLG-17B, Bloom-176B, MPT-7B, and Jurassic-1. Despite its transformative capabilities, in some key scenarios ZeRO incurs heavy data transfer overhead between GPUs, which lowers training efficiency. This happens in particular when a) the global batch size is small relative to a large number of GPUs, so the per-GPU batch size is small and communication must be frequent; or b) training runs on low-end clusters where limited cross-node network bandwidth causes high communication latency. In these cases, ZeRO's training efficiency is limited.
+
+To address these limitations, we are releasing [ZeRO++](https://arxiv.org/abs/2306.10209). ZeRO++ reduces total communication volume by 4x compared to ZeRO without affecting model quality. This has two key implications:
+
+1. *ZeRO++ accelerates large-model pretraining and fine-tuning*
+    1. Small per-GPU batch sizes: Whether pretraining large models on thousands of GPUs or fine-tuning them on hundreds or even tens of GPUs, ZeRO++ delivers up to 2.2x higher throughput than ZeRO when the per-GPU batch size is small, directly reducing training time and cost.
+    2. Low-bandwidth clusters: ZeRO++ enables low-bandwidth clusters to achieve throughput similar to high-end clusters with 4x the bandwidth, making efficient large-model training possible across a much wider range of clusters.
+
+2. *ZeRO++ accelerates RLHF training of ChatGPT-like models*
+
+    1. Although ZeRO++ was designed primarily for training, its optimizations automatically apply to [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html#:~:text=ZeRO-Inference%20adapts%20and%20optimizes%20ZeRO-Infinity%20techniques%20for%20model,memory%2C%20thus%20hosting%20no%20%28zero%29%20weights%20in%20GPU.) as well, since the communication overhead applies equally to ZeRO's training and inference. ZeRO++ can therefore improve the efficiency of algorithms such as reinforcement learning from human feedback (RLHF), which combines training and inference.
+
+    2. Through its integration with DeepSpeed-Chat, ZeRO++ can speed up the generation phase of RLHF training by up to 2x and the reinforcement learning training phase by up to 1.3x compared to the original ZeRO.
+
+Next, we take a deeper look at ZeRO and its communication overhead, and discuss the key optimizations in ZeRO++ that address it. We then show the impact of ZeRO++ on training throughput across model sizes, batch sizes, and bandwidth constraints, and discuss how ZeRO++ applies to DeepSpeed-Chat to accelerate the training of dialogue models with RLHF.
+
+## ZeRO++ in Depth
+
+
+
+
+
+Figure 2: The ZeRO optimizer workflow
+
+
+ZeRO is a memory-efficient form of data parallelism in which the model states are partitioned across all GPUs rather than replicated, and are reconstructed during training as needed using gather/broadcast-based collectives. This lets ZeRO effectively harness the aggregate GPU memory and compute of all devices while keeping the simplicity and ease of use of data-parallel training.
+
+Let the model size be M. During the forward pass, ZeRO performs all-gather/broadcast operations to collect the parameters for each model layer just before they are needed (a total of size M). In the backward pass, ZeRO uses a similar communication pattern on each layer's parameters to compute their local gradients (a total of size M). In addition, ZeRO averages and partitions each local gradient immediately after it is computed, using reduce or reduce-scatter communication (a total of size M). ZeRO therefore communicates 3M in total, spread evenly across two all-gather/broadcast operations and one reduce-scatter/reduce operation.
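+
+To make the accounting concrete, here is a small back-of-the-envelope sketch (illustrative Python, not DeepSpeed code) that tallies the per-step communication volume of ZeRO-3 and, for comparison, the reduced volume that ZeRO++ targets, as explained in the sections below:
+
+```python
+def zero3_comm_volume(M):
+    """Per-step communication volume of ZeRO-3 for a model of size M."""
+    fwd_allgather = M          # gather weights layer by layer before the forward pass
+    bwd_allgather = M          # gather weights again for the backward pass
+    grad_reduce_scatter = M    # average and partition the local gradients
+    return fwd_allgather + bwd_allgather + grad_reduce_scatter   # = 3M
+
+def zeropp_comm_volume(M):
+    """The same accounting with the three ZeRO++ optimizations applied."""
+    fwd_allgather = 0.5 * M          # qwZ: fp16 -> int8 halves the all-gather volume
+    bwd_allgather = 0.0              # hpZ: the backward all-gather stays inside each node
+    grad_reduce_scatter = 0.25 * M   # qgZ: int4 gradients quarter the reduce-scatter
+    return fwd_allgather + bwd_allgather + grad_reduce_scatter   # = 0.75M
+
+M = 1.0  # normalized model size
+print(zero3_comm_volume(M) / zeropp_comm_volume(M))  # -> 4.0, the advertised 4x reduction
+```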
+
+To reduce these communication overheads, ZeRO++ introduces three sets of communication optimizations, one for each of the three collectives above:
+
+
+
+
+
+Figure 3: Partitioned (blockwise) quantization in qwZ
+
+
+
+### Quantized weight communication for ZeRO (qwZ)
+
+First, to reduce the parameter communication volume during all-gather, we quantize each model parameter on the fly from FP16 (two bytes) to the INT8 (one byte) data type before communication and dequantize the weights afterwards. Naively quantizing the weights, however, degrades training accuracy. To preserve good training accuracy, we adopt partitioned quantization, quantizing each subset of the model parameters independently. No high-performance implementation of partitioned quantization existed, so we implemented a set of highly optimized quantization CUDA kernels from scratch, achieving 3x higher accuracy and 5x higher speed than basic quantization.
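+
+The fused CUDA kernels are what ship in ZeRO++; as a minimal sketch of the underlying idea only, blockwise symmetric INT8 quantization can be expressed in a few lines of PyTorch (the block size and tensor shape here are illustrative assumptions):
+
+```python
+import torch
+
+def blockwise_quantize(w: torch.Tensor, block_size: int = 2048):
+    # One scale per block: quantization error is bounded by each block's own
+    # value range, not by the global max of the tensor.
+    # Assumes w.numel() % block_size == 0.
+    blocks = w.float().flatten().view(-1, block_size)
+    scales = blocks.abs().amax(dim=1, keepdim=True) / 127.0
+    q = torch.clamp((blocks / scales).round(), min=-127, max=127).to(torch.int8)
+    return q, scales
+
+def blockwise_dequantize(q: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
+    return (q.float() * scales).flatten()
+
+w = torch.randn(8 * 2048, dtype=torch.float16)   # stand-in for a parameter shard
+q, scales = blockwise_quantize(w)
+w_hat = blockwise_dequantize(q, scales)
+print((w.float() - w_hat).abs().max())           # small, block-bounded error
+```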
+
+
+
+
+
+Figure 4: Hierarchical partitioning of weights (hpZ)
+
+
+
+### Hierarchical weight partitioning for ZeRO (hpZ)
+
+Second, to reduce the communication overhead of the all-gather on weights during the backward pass, we trade GPU memory for communication. Specifically, instead of spreading the model weights across all machines as in ZeRO, we maintain a full copy of the model within each machine. At the cost of higher memory overhead, this lets us replace the expensive cross-machine all-gather/broadcast on weights with an intra-machine all-gather/broadcast, which is substantially faster thanks to the higher intra-machine communication bandwidth.
+
+
+
+
+
+Figure 5: The end-to-end qgZ workflow
+
+
+
+### Quantized gradient communication for ZeRO (qgZ)
+
+Third, reducing the cost of the reduce-scatter on gradients is even more challenging, because directly applying quantization to reduce communication volume is infeasible: even with partitioned quantization to lower the quantization error, gradient reduction accumulates and amplifies that error. To address this, we quantize gradients only before communication, but dequantize them back to full precision before any reduce operation. To do this efficiently, we invented a novel all-to-all-based quantized gradient communication paradigm called qgZ, which is functionally equivalent to a compressed reduce-scatter operation.
+
+qgZ is designed to solve two challenges: i) naively implementing reduce-scatter in INT4/INT8 would cause significant accuracy loss, and ii) using quantization in a conventional tree- or ring-based reduce-scatter requires a long chain of quantize-dequantize steps, which accumulates error and adds significant latency even if the reductions themselves run at full precision. To solve both challenges, qgZ replaces the tree- or ring-based reduce-scatter algorithm with a novel hierarchical all-to-all approach.
+
+qgZ has three main steps: i) gradient slice reordering, ii) intra-node communication and reduction, and iii) inter-node communication and reduction. First, before any communication happens, we slice the gradients and reorder the tensor slices so that the final gradient placement on each GPU at the end of communication (the green blocks in Figure 5) is correct. Second, we quantize the reordered gradient slices, perform an all-to-all within each node, dequantize the gradient slices received from the all-to-all, and do a local reduction. Third, we quantize the locally reduced gradients again, perform an inter-node all-to-all, dequantize the received gradients once more, and compute the final high-precision gradient reduction, producing the green blocks in Figure 5.
+
+The reason for this hierarchical approach is to reduce cross-node communication volume. More precisely, given N GPUs per node, a model size of M, and a quantization ratio of Z, a single-hop all-to-all would generate M\*N/Z of cross-node traffic. With the hierarchical approach, we instead reduce the cross-node traffic of each GPU from M/Z to M/(Z\*N), so the total drops from M\*N/Z to M\*N/(Z\*N) = M/Z. We further optimize qgZ's end-to-end latency by overlapping intra-node and inter-node communication and by fusing the CUDA kernels for (tensor slice reordering + intra-node quantization) and for (intra-node dequantization + intra-node reduction + inter-node quantization).
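+
+The traffic arithmetic in this paragraph is easy to check numerically; the sketch below (plain Python with illustrative values) compares a single-hop all-to-all against qgZ's two-hop hierarchical scheme:
+
+```python
+def cross_node_traffic(M, N, Z):
+    """M: model size, N: GPUs per node, Z: quantization compression ratio."""
+    single_hop = M * N / Z        # each of the N GPUs sends M/Z across nodes
+    hierarchical = M / Z          # after intra-node reduction, M/(Z*N) per GPU
+    return single_hop, hierarchical
+
+M, N, Z = 1.0, 8, 4               # normalized model size, 8 GPUs/node, int4 -> Z = 4
+single_hop, qgz = cross_node_traffic(M, N, Z)
+print(single_hop, qgz)            # 2.0 vs 0.25: qgZ cuts cross-node traffic by N (= 8x)
+```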
+
+
+
+| Communication Volume | Forward all-gather on weights | Backward all-gather on weights | Backward reduce-scatter on gradients | Total |
+|:---------------------------:|:------------------------------------:|:-------------------------------------:|:-------------------------------------------:|:------------:|
+| ZeRO | M | M | M | 3M |
+| ZeRO++ | 0.5M | 0 | 0.25M | 0.75M |
+
+
+
+### **Total communication volume optimization**
+
+By combining all three components above, we reduce the cross-node communication volume from 3M down to 0.75M. Specifically, we use qwZ to cut the forward all-gather/broadcast on model weights from M to 0.5M; we use hpZ to eliminate the cross-node all-gather during the backward pass, reducing that communication from M to 0; and we use qgZ to cut the cross-node reduce-scatter during the backward pass from M to 0.25M.
+
+## **ZeRO++ accelerates large language model training**
+
+Here we show test results for ZeRO++ in real LLM training scenarios on 384 NVIDIA V100 GPUs.
+
+
+
+
+
+Figure 6: Throughput of ZeRO++ vs. ZeRO at various model sizes on 384 V100 GPUs, with nodes interconnected by 4 InfiniBand (IB) links, each running at 100 Gbps.
+
+
+
+### **Higher training efficiency with small per-GPU batch sizes**
+
+**High-bandwidth clusters:** As shown in Figure 6, we first demonstrate ZeRO++'s throughput improvement over ZeRO across model sizes and micro-batch sizes, using 4x InfiniBand (IB) links for 400 Gbps of cross-node interconnect bandwidth, each link running at 100 Gbps. At a micro-batch size of 1k tokens per GPU, ZeRO++ delivers 28% to 36% higher throughput than ZeRO-3. At a 2k-token micro-batch size, ZeRO++ achieves a 24% to 29% throughput gain over ZeRO-3.
+
+
+
+
+
+
+Figure 7: Throughput of various LLMs on 384 V100 GPUs with 100 Gbps cross-node bandwidth
+
+
+
+**Low-bandwidth clusters:** In low-bandwidth network environments such as 100 Gbps, ZeRO++ performs significantly better than ZeRO-3. As shown in Figure 7, ZeRO++ achieves up to 2.2x speedup in end-to-end throughput compared to ZeRO-3. On average, ZeRO++ achieves roughly a 2x speedup over the ZeRO-3 baseline.
+
+
+
+
+
+
+Figure 8: ZeRO++ achieves high-bandwidth-cluster performance with significantly lower bandwidth
+
+
+
+### **Equivalent training efficiency between high-bandwidth ZeRO and low-bandwidth ZeRO++ clusters**
+
+Furthermore, ZeRO++ can achieve system throughput in a low-bandwidth cluster comparable to ZeRO in a much higher-bandwidth setting. As shown in Figure 8, for both the 18B and 138B model sizes, ZeRO++ with 200 Gbps of cross-node bandwidth reaches TFLOPs similar to ZeRO-3 with 800 Gbps of cross-node bandwidth.
+
+Given this excellent scalability, we view ZeRO++ as the next generation of ZeRO for training large AI models.
+
+## **DeepSpeed-Chat with ZeRO++ for RLHF training**
+
+### **RLHF training background**
+
+ChatGPT-like models are powered by LLMs and [fine-tuned with RLHF](https://openai.com/blog/chatgpt). RLHF consists of a generation (inference) phase and a training phase. In the generation phase, the actor model takes a partial conversation as input and generates a response using a sequence of forward passes. In the training phase, the critic model ranks the generated responses by quality, providing a reinforcement signal to the actor model. The actor model is fine-tuned with these rankings, enabling it to generate more accurate and appropriate responses in subsequent iterations.
+
+RLHF training creates substantial memory pressure because it uses four models (actor, reference, critic, reward). A common solution is low-rank adaptation (LoRA), which freezes the pretrained model's weights and injects trainable rank-decomposition matrices into each layer of the Transformer architecture, significantly reducing the number of trainable parameters. LoRA speeds up RLHF by reducing memory usage, allowing larger batch sizes and thus much higher throughput.
+
+### **DeepSpeed-Chat with ZeRO++ for RLHF training**
+
+
+
+
+
+
+Figure 9: ZeRO++ accelerates both the generation and training phases of RLHF training
+
+
+
+ZeRO++ has a unique application in the RLHF + LoRA scenario, because most of the model weights are frozen. This means ZeRO++ can hold these frozen weights quantized in INT4/8, rather than storing them in FP16 and quantizing them before each communication operation. Dequantization after communication is still performed to prepare the weights for computation, but the dequantized weights are simply discarded once the computation completes.
+
+Using ZeRO++ this way for RLHF training reduces both memory usage and communication volume, improving training throughput through less communication and through the larger batch sizes enabled by lower memory usage. In the generation phase, ZeRO++ uses hpZ to keep all weight communication within each node, exploiting the higher intra-node bandwidth and reducing communication volume to further boost generation throughput.
+
+ZeRO++ has been integrated into DeepSpeed-Chat to power RLHF training of ChatGPT-like models. In Figure 9, we compare RLHF generation throughput for actor models of different sizes, testing ZeRO and ZeRO++ on 32 V100 GPUs with 30B and 66B actor models. The results show that ZeRO++ achieves up to 2.25x higher RLHF generation throughput than ZeRO. We also show a speedup in the training phase on 16 V100 GPUs, where ZeRO++ achieves 1.26x higher throughput than ZeRO, thanks to the lower communication volume and larger batch sizes enabled by ZeRO++.
+
+## **DeepSpeed ZeRO++ is now available!**
+
+We are very excited to release DeepSpeed ZeRO++ and make it available to everyone in the AI community. Please visit our GitHub page for the [LLM training tutorial](https://www.deepspeed.ai/tutorials/zeropp/). ZeRO++ for DeepSpeed-Chat will be released in the coming weeks.
+
+For more technical details about ZeRO++, please see our [arXiv paper](https://arxiv.org/pdf/2306.10209.pdf).
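+
+As a quick orientation (the tutorial above is authoritative; the flag names below follow it but should be double-checked there), the three optimizations are enabled together under ZeRO stage 3 in the DeepSpeed config, roughly like this:
+
+```python
+ds_config = {
+    "zero_optimization": {
+        "stage": 3,
+        "zero_quantized_weights": True,    # qwZ: int8 weight all-gather
+        "zero_hpz_partition_size": 16,     # hpZ: secondary partition = GPUs per node
+        "zero_quantized_gradients": True,  # qgZ: quantized gradient reduce-scatter
+    },
+    "bf16": {"enabled": True},
+}
+# model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
+#                                                      config=ds_config, ...)
+```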
+
+DeepSpeed-ZeRO++ is part of the DeepSpeed ecosystem. To learn more, please visit our website, where you will find detailed blog posts, tutorials, and helpful documentation.
+
+You can also get the latest DeepSpeed news on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) accounts.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the DeepSpeed GitHub page. See our contributing guide for more details. We are open to collaborations with universities, research labs, and companies. For such requests (and others not suited for GitHub), please reach out to us directly by email.
+
+**Contributors:**
+
+This project was made possible by contributions from the following members of the DeepSpeed team:
+
+[Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), Heyang Qin, Sam Ade Jacobs, Connor Holmes, [Samyam Rajbhandari](https://www.microsoft.com/en-us/research/people/samyamr/), [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/), Ammar Ahmad Awan, Jeff Rasley, Michael Wyatt, [Yuxiong He](https://www.microsoft.com/en-us/research/people/yuxhe/) (team lead)
diff --git a/blogs/zeropp/japanese/README.md b/blogs/zeropp/japanese/README.md
new file mode 100644
index 000000000000..a4d4e68f6b02
--- /dev/null
+++ b/blogs/zeropp/japanese/README.md
@@ -0,0 +1,186 @@
+
+
+# DeepSpeed ZeRO++: Dramatically faster training for LLMs and chat models – a 4x reduction in communication overhead –
+
+
+
+
+
+
+Figure 1: An overview of DeepSpeed ZeRO++
+
+
+Large AI models are transforming the digital world as we speak. Generative language models powered by large language models (LLMs), such as Turing-NLG, ChatGPT, and GPT-4, are remarkably versatile and can perform tasks like summarization, coding, and translation. Similarly, large multimodal generative models such as DALL·E, Microsoft Designer, and Bing Image Creator can generate art, architecture, video, and other digital assets, empowering content creators, architects, and engineers to unlock creative productivity and open new frontiers.
+
+However, training these large models requires vast memory and computing resources across hundreds or thousands of GPU devices. For example, more than 4,000 NVIDIA A100 GPUs were used to train the [Megatron-Turing NLG 530B model](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/). Using these resources efficiently requires a complex optimization system that partitions the model to fit in the memory of the individual GPU devices and parallelizes computation efficiently across them. At the same time, to make large-model training easy for users, these optimizations must be simple to apply.
+
+The family of optimization techniques called [ZeRO](https://www.deepspeed.ai/tutorials/zero/) provided by DeepSpeed offers a powerful solution to these challenges and has been widely used to train large, powerful deep learning models such as TNLG-17B, Bloom-176B, MPT-7B, and Jurassic-1. Despite its strengths, in some usage scenarios the data transfer overhead between GPUs grows large, making high training efficiency hard to achieve. This happens in particular when a) training on many GPUs relative to the (global) batch size, so the per-GPU batch size is small and frequent communication is required, or b) training on low-end compute clusters, where inter-node network bandwidth is limited and communication latency is high. In these scenarios, ZeRO's benefits of ease of use and computational efficiency cannot be fully realized.
+
+[ZeRO++](https://arxiv.org/abs/2306.10209), which we are releasing today, is a system that solves these problems by optimizing ZeRO's communication, delivering extremely high efficiency for large-model training regardless of batch-size limits or inter-device bandwidth constraints. By combining quantization with communication and data remapping, ZeRO++ reduces total communication volume by 4x compared to ZeRO without affecting model quality. This has the two key effects described below.
+
+
+1. *Faster pretraining and fine-tuning of large models*
+    1. Small per-GPU batch sizes: Whether pretraining a large model on thousands of GPUs or fine-tuning one on hundreds or tens of GPUs, ZeRO++ provides up to 2.2x the throughput of ZeRO when the per-GPU batch size is small, reducing training time and cost.
+
+    2. Low-bandwidth clusters: With ZeRO++, a cluster with limited bandwidth can achieve throughput equivalent to a cluster with 4x the bandwidth, enabling efficient large-model training across a wide variety of clusters.
+
+2. *Faster training of ChatGPT-like models with RLHF*
+
+    1. Although ZeRO++ was designed primarily to accelerate training, communication overhead is a challenge shared by both training and inference with ZeRO, so its optimizations are also effective for [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html#:~:text=ZeRO-Inference%20adapts%20and%20optimizes%20ZeRO-Infinity%20techniques%20for%20model,memory,%20thus%20hosting%20no%20(zero)%20weights%20in%20GPU.), the mechanism used for inference. As a result, ZeRO++ improves the efficiency of workloads that combine training and inference, such as reinforcement learning from human feedback (RLHF) used for dialogue models.
+
+    2. Through its integration with DeepSpeed-Chat, ZeRO++ can accelerate the generation phase of RLHF training by up to 2x and the training phase by up to 1.3x compared to the original ZeRO.
+
+Next, we dig deeper into ZeRO and its communication overhead, and explain the key optimizations in ZeRO++. We also demonstrate the impact of ZeRO++ on training speed across model sizes, batch sizes, and bandwidth constraints, and discuss how ZeRO++ is applied to DeepSpeed-Chat to accelerate training dialogue models with RLHF.
+
+## ZeRO++ in detail
+
+
+
+
+
+Figure 2: Optimizations in ZeRO
+
+
+ZeRO is a memory-efficient improvement on data parallelism: instead of replicating the model states on every GPU, it partitions them across GPUs and reconstructs the partitioned model states during training with gather/broadcast collectives, executed on demand. This lets ZeRO aggregate the memory and compute of all GPU devices and use them effectively, while preserving the simplicity and ease of use of data parallelism.
+
+In the forward pass, ZeRO collects the parameters of each model layer just before use via allgather/broadcast communication (let the total parameter size be M). In the backward pass, ZeRO uses the same communication pattern on each layer's parameters to compute gradients locally on each GPU (the total gradient size is likewise M). In addition, ZeRO averages and partitions the locally computed gradients using reduce or reduce-scatter communication (total size M). In total, with two allgather/broadcast operations and one reduce or reduce-scatter, the communicated data adds up to 3M.
+
+To reduce these communication overheads, ZeRO++ implements a set of optimization techniques targeting the three communications above:
+
+
+
+
+
+Figure 3: Block-based quantization in qwZ
+
+
+
+### Quantized parameter communication (qwZ)
+
+First, to reduce the volume of parameter communication during allgather, we quantize the parameters: immediately before communication, each model parameter is converted from the FP16 (two-byte) to the INT8 (one-byte) data type, and converted back after communication. Naive parameter quantization, however, can degrade training accuracy. To preserve accuracy, we adopt block-based quantization, which quantizes each subset of the model parameters independently. Since no high-performance implementation of block-based quantization existed, we implemented highly optimized quantization CUDA kernels from scratch for ZeRO++, achieving 3x better accuracy and 5x higher speed compared to basic quantization.
+
+
+
+
+
+Figure 4: Hierarchical parameter partitioning in hpZ
+
+
+
+### Hierarchical parameter partitioning for ZeRO (hpZ)
+
+Second, in the backward pass we reduce the communication overhead of the parameter allgather in exchange for higher GPU memory usage. Specifically, instead of distributing the entire model's parameters across the GPU devices of all servers as in ZeRO, we keep a complete copy of the model within each server. This increases the required memory, but it replaces the allgather/broadcast across servers, where communication bandwidth is typically limited, with an allgather/broadcast that uses only high-bandwidth intra-server communication, yielding a substantial speedup.
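+
+As a rough accounting sketch (plain Python; simplified assumptions: weights only, ZeRO-3 semantics), this is the memory-for-traffic trade that hpZ makes:
+
+```python
+def hpz_tradeoff(M, gpus_per_node, world_size):
+    """Per-GPU weight-shard memory and cross-node backward all-gather traffic."""
+    zero3_shard = M / world_size        # weights sharded across every GPU
+    hpz_shard = M / gpus_per_node       # a full copy sharded within each node
+    zero3_cross_node = M                # backward all-gather crosses node boundaries
+    hpz_cross_node = 0.0                # backward all-gather is intra-node only
+    return zero3_shard, hpz_shard, zero3_cross_node, hpz_cross_node
+
+# e.g., a normalized model on 8 nodes of 8 GPUs: 8x more weight memory per GPU,
+# in exchange for removing the backward all-gather from the cross-node network.
+print(hpz_tradeoff(M=1.0, gpus_per_node=8, world_size=64))
+```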
+
+
+
+
+
+Figure 5: The qgZ workflow
+
+
+
+### Quantized gradient communication (qgZ)
+
+Reducing the cost of gradient communication via reduce-scatter, which we take up next, is even harder than the challenges above. Naively applying quantization to shrink the communication volume accumulates errors during the additions performed by the reduce, even with block-based quantization. We therefore quantize gradients before sending them and, after receiving them, dequantize them before the reduce's additions. To do this efficiently, we devised a new all-to-all-based quantized gradient communication paradigm called qgZ.
+
+qgZ is designed to solve two problems: i) overcoming the significant accuracy loss that would result from computing the reduce at low precision if we simply implemented reduce-scatter in INT4/INT8, and ii) avoiding the accuracy loss and long latency overhead caused by the long series of quantize/dequantize steps in a conventional ring- or tree-based reduce-scatter, even when the reduce-scatter itself runs at full precision. Instead of a ring- or tree-based reduce-scatter algorithm, qgZ uses a novel hierarchical all-to-all communication approach.
+
+qgZ has three main steps: i) reordering of gradient slices, ii) intra-node communication and reduction, and iii) inter-node communication and reduction. First, before any communication occurs, we slice the gradient tensors and reorder the slices so that the correct gradient placement (the green gradient slices in Figure 5) is obtained on each GPU when communication completes. Second, the reordered gradient slices are quantized and an all-to-all is performed within each node; the gradient slices received from the all-to-all are dequantized and reduced (summed) locally. Third, the locally reduced gradients are quantized again and exchanged across nodes with another all-to-all; the received gradients are dequantized once more and reduced at the original precision, yielding the green gradient slices of Figure 5.
+
+The reason for this hierarchical approach is to reduce inter-node communication volume. More precisely, given N GPUs per node, a model size of M, and a quantization ratio of Z, a single-hop all-to-all generates M*N/Z of inter-node traffic. With the hierarchical approach, the inter-node traffic of each GPU drops from M/Z to M/(Z*N), so the total communication volume falls from M*N/Z to M*N/(Z*N) = M/Z. We further optimize qgZ's end-to-end latency by overlapping intra-node and inter-node communication and by fusing the CUDA kernels for (tensor slice reordering + intra-node quantization) and (intra-node dequantization + intra-node reduction + inter-node quantization).
+
+
+
+| Communication Volume | Forward all-gather on weights | Backward all-gather on weights | Backward reduce-scatter on gradients | Total |
+|:---------------------------:|:------------------------------------:|:-------------------------------------:|:-------------------------------------------:|:------------:|
+| ZeRO | M | M | M | 3M |
+| ZeRO++ | 0.5M | 0 | 0.25M | 0.75M |
+
+
+
+### **Communication volume reduction**
+
+By incorporating all three optimization techniques above, we reduce the inter-node communication volume from 3M to 0.75M. Specifically, qwZ reduces the forward allgather/broadcast on model parameters from M to 0.5M, hpZ eliminates the cross-node allgather in the backward pass, reducing it from M to 0, and qgZ reduces the inter-node reduce-scatter in the backward pass from M to 0.25M.
+
+## **Accelerating LLM training with ZeRO++**
+
+Here we present evaluation results for ZeRO++ in real LLM training scenarios using 384 NVIDIA V100 GPUs.
+
+
+
+
+
+Figure 6: Throughput of ZeRO++ vs. ZeRO at various model sizes (384 V100 GPUs, 400 Gbps (4x 100 Gbps) inter-node connectivity)
+
+
+
+### **High efficiency even with small per-GPU batch sizes**
+
+**High-bandwidth clusters:** Figure 6 shows that ZeRO++ outperforms ZeRO in throughput across model sizes and micro-batch sizes, using 400 Gbps inter-node connections built from four InfiniBand (IB) links, each running at 100 Gbps. With 1k tokens per GPU, ZeRO++ achieves a 28% to 36% throughput improvement over ZeRO-3. With a micro-batch size of 2k, ZeRO++ achieves a 24% to 29% throughput improvement over ZeRO-3.
+
+
+
+
+
+
+Figure 7: Throughput comparison of differently sized LLMs (384 GPUs, 100 Gbps inter-node connectivity)
+
+
+**Low-bandwidth clusters:** In slower network environments such as 100 Gbps networks, ZeRO++ performs significantly better. As shown in Figure 7, ZeRO++ achieves up to 2.2x speedup in end-to-end throughput over ZeRO-3. On average, ZeRO++ achieves roughly a 2x speedup over the ZeRO-3 baseline.
+
+
+
+
+
+
+Figure 8: With ZeRO++, a low-bandwidth cluster achieves performance equivalent to running ZeRO on a high-bandwidth cluster
+
+
+
+### **Matching the efficiency of high-bandwidth clusters on low-bandwidth clusters**
+
+Moreover, ZeRO++ can achieve system throughput on a low-bandwidth cluster comparable to ZeRO on a cluster with far higher bandwidth. As shown in Figure 8, for both the 18B and 138B models, ZeRO++ in an environment with 200 Gbps inter-node connectivity achieves TFLOPs equivalent to ZeRO-3 with 800 Gbps inter-node connectivity. Given this excellent scalability, we position ZeRO++ as the next generation of ZeRO for training large AI models.
+
+## **Applying ZeRO++ to RLHF training with DeepSpeed-Chat**
+
+### **Background on RLHF training**
+
+Models like ChatGPT are built by training an LLM and [fine-tuning it with RLHF](https://openai.com/blog/chatgpt). RLHF consists of a generation (inference) phase and a training phase. In the generation phase, the actor model takes a partial conversation as input and generates a response using a series of forward passes. In the training phase, the critic model ranks the generated responses by quality and provides a reinforcement signal to the actor model. The actor model is fine-tuned using these rankings so that it generates more accurate and appropriate responses in subsequent iterations.
+
+RLHF training requires a very large amount of memory because it uses four models (actor, reference, critic, reward). To address this, low-rank adaptation (LoRA) is employed. LoRA freezes the pretrained model's parameters and adds trainable rank-decomposition matrices to each layer of the Transformer architecture, greatly reducing the number of trainable parameters. By cutting memory usage with LoRA, RLHF is accelerated, larger batch sizes become possible, and throughput improves substantially.
+
+### **Applying ZeRO++ to DeepSpeed-Chat for RLHF training**
+
+
+
+
+
+
+Figure 9: ZeRO++ accelerates both the generation and training phases of RLHF training
+
+
+
+When LoRA is used in RLHF, most model parameters are frozen. ZeRO++ offers a special capability that exploits this: while ZeRO++ normally keeps frozen parameters in FP16 and quantizes them before each communication operation, in RLHF they can instead be quantized to INT4/8 ahead of time. Dequantization after communication is still required, but the dequantized parameters are discarded once the computation that uses them finishes.
+
+Using ZeRO++ for RLHF training in this way reduces both memory usage and communication volume. Because memory usage is reduced along with communication, larger batch sizes become possible and training throughput improves. In the generation phase, ZeRO++ uses hpZ to keep all parameter communication within each node, reducing communication volume while exploiting the high intra-node bandwidth, which further improves generation throughput.
+
+ZeRO++ is integrated into DeepSpeed-Chat and provides strong support for RLHF training of ChatGPT-like models. In Figure 9, we compare the RLHF generation throughput of ZeRO and ZeRO++ on 32 V100 GPUs for 30B and 66B actor models. The results confirm that ZeRO++ improves RLHF generation throughput by up to 2.25x over ZeRO. In the training phase on 16 V100 GPUs, ZeRO++ achieves 1.26x better throughput than ZeRO, thanks to the lower communication volume and larger batch sizes that ZeRO++ enables.
+
+## **Try it out now!**
+
+We are delighted to release DeepSpeed ZeRO++ and make it available to everyone in the AI community. To get started, see the [tutorial](https://www.deepspeed.ai/tutorials/zeropp/) on LLM training. ZeRO++ for DeepSpeed-Chat is scheduled for release in the coming weeks.
+
+For the technical details of ZeRO++, see our [paper](https://arxiv.org/pdf/2306.10209.pdf) on arXiv.
+
+DeepSpeed-ZeRO++ is part of the DeepSpeed ecosystem. For more information, visit our [website](https://www.deepspeed.ai/), where you will find detailed blog posts, tutorials, and documentation.
+
+We also share the latest DeepSpeed news on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu account](https://www.zhihu.com/people/deepspeed).
+
+DeepSpeed welcomes your participation in its development. You can report bugs, submit pull requests, and join discussions on the DeepSpeed GitHub page; see the [guidelines](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for details. We also collaborate with universities, research institutes, and companies. For such collaboration requests (and other topics not suited to GitHub), please contact us directly by email.
+
+
+**Contributors:**
+
+This project was made possible by the following members of the DeepSpeed team:
+
+[Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), Heyang Qin, Sam Ade Jacobs, Connor Holmes, [Samyam Rajbhandari](https://www.microsoft.com/en-us/research/people/samyamr/), [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/), Ammar Ahmad Awan, Jeff Rasley, Michael Wyatt, [Yuxiong He](https://www.microsoft.com/en-us/research/people/yuxhe/) (team lead)
diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp
index 5ae2419cb5e1..563255176500 100644
--- a/csrc/adagrad/cpu_adagrad.cpp
+++ b/csrc/adagrad/cpu_adagrad.cpp
@@ -1,3 +1,8 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
#include "cpu_adagrad.h"
#include
#include
@@ -42,6 +47,8 @@ void Adagrad_Optimizer::Step_1(float* _params,
size_t offset = copy_size + t;
#if defined(__ENABLE_CUDA__)
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
+#elif defined(__ENABLE_CANN__)
+ if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); }
#endif
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
@@ -57,7 +64,7 @@ void Adagrad_Optimizer::Step_1(float* _params,
grad += _eps;
grad = momentum / grad;
param = grad * step_size + param;
-#if defined(__ENABLE_CUDA__)
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
#endif
if (half_precision)
@@ -74,6 +81,17 @@ void Adagrad_Optimizer::Step_1(float* _params,
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
+#elif defined(__ENABLE_CANN__)
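+ // CANN (Huawei Ascend) path: aclrtMemcpy performs a synchronous host-to-device
+ // copy of the updated fp32 tile from the staging buffer, so unlike the CUDA
+ // branch above there is no asynchronous launch on a stream here.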
+ if (dev_params) {
+ size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]);
+ aclrtMemcpy(dev_params + t,
+ memcpy_size,
+ _doubled_buffer[_buf_index],
+ memcpy_size,
+ aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE);
+
+ _buf_index = !_buf_index;
+ }
#endif
}
}
@@ -173,9 +191,9 @@ int ds_adagrad_step(int optimizer_id,
 std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
- opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
+ opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel());
-#if defined(__ENABLE_CUDA__)
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
opt->SynchronizeStreams();
#endif
return 0;
@@ -191,7 +209,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
-#if defined(__ENABLE_CUDA__)
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
@@ -209,7 +227,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id,
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
- params_c.size(0),
+ params_c.numel(),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp
index f17f22535ab8..96809827f3e1 100644
--- a/csrc/adam/cpu_adam.cpp
+++ b/csrc/adam/cpu_adam.cpp
@@ -1,297 +1,9 @@
-#include "cpu_adam.h"
-#include
-#include
-#include
-#include
-#include
-#include
-
-#if defined(__ENABLE_CUDA__)
-#include
-#include "cublas_v2.h"
-#include "cuda.h"
-#include "curand.h"
-#include "custom_cuda_layers.h"
-#endif
-
-static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
-
-// C++ interface
-
-void Adam_Optimizer::Step_1(float* _params,
- float* grads,
- float* _exp_avg,
- float* _exp_avg_sq,
- size_t _param_size,
- ds_half_precision_t* dev_params,
- bool half_precision)
-{
- size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
- Step_AVX<1>(&rounded_size,
- _params,
- grads,
- _exp_avg,
- _exp_avg_sq,
- _param_size,
- dev_params,
- half_precision);
-#endif
- if (_param_size > rounded_size) {
- float betta1_minus1 = 1 - _betta1;
- float betta2_minus1 = 1 - _betta2;
-
- float step_size = -1 * _alpha / _bias_correction1;
- float w_decay = -1 * _alpha * _weight_decay;
- ds_half_precision_t* grads_cast_h;
- ds_half_precision_t* params_cast_h;
- if (half_precision) {
- grads_cast_h = reinterpret_cast<ds_half_precision_t*>(grads);
- params_cast_h = reinterpret_cast<ds_half_precision_t*>(_params);
- }
-
- for (size_t t = rounded_size; t < _param_size; t += TILE) {
- size_t copy_size = TILE;
- if ((t + TILE) > _param_size) copy_size = _param_size - t;
- size_t offset = copy_size + t;
-#if defined(__ENABLE_CUDA__)
- if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
-#endif
-#pragma omp parallel for
- for (size_t k = t; k < offset; k++) {
- float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
- float param = half_precision ? (float)params_cast_h[k] : _params[k];
- float momentum = _exp_avg[k];
- float variance = _exp_avg_sq[k];
- if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
- momentum = momentum * _betta1;
- momentum = grad * betta1_minus1 + momentum;
-
- variance = variance * _betta2;
- grad = grad * grad;
- variance = grad * betta2_minus1 + variance;
-
- grad = sqrt(variance);
- grad = grad * _bias_correction2 + _eps;
- grad = momentum / grad;
- if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
- param = grad * step_size + param;
-#if defined(__ENABLE_CUDA__)
- if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
-#endif
- if (half_precision)
- params_cast_h[k] = (ds_half_precision_t)param;
- else
- _params[k] = param;
- _exp_avg[k] = momentum;
- _exp_avg_sq[k] = variance;
- }
-#if defined(__ENABLE_CUDA__)
- if (dev_params) {
- launch_param_update(
- _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
-
- _buf_index = !_buf_index;
- }
-#endif
- }
- }
-}
-
-void Adam_Optimizer::Step_4(float* _params,
- float* grads,
- float* _exp_avg,
- float* _exp_avg_sq,
- size_t _param_size,
- ds_half_precision_t* dev_params,
- bool half_precision)
-{
- size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
- Step_AVX<4>(&rounded_size,
- _params,
- grads,
- _exp_avg,
- _exp_avg_sq,
- _param_size,
- dev_params,
- half_precision);
-#endif
- if (_param_size > rounded_size)
- Step_1((_params + rounded_size),
- (grads + rounded_size),
- (_exp_avg + rounded_size),
- (_exp_avg_sq + rounded_size),
- (_param_size - rounded_size),
- (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
- half_precision);
-}
-
-int create_adam_optimizer(int optimizer_id,
- float alpha = 1e-3,
- float betta1 = 0.9,
- float betta2 = 0.999,
- float eps = 1e-8,
- float weight_decay = 0,
- bool adamw_mode = true,
- bool should_log = false)
-{
- auto opt =
- std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
-
- s_optimizers[optimizer_id] = opt;
-
- if (should_log) {
- std::string avx_type = "";
-#if defined(__AVX512__)
- avx_type = "AVX512";
-#else
-#if defined(__AVX256__)
- avx_type = "AVX2";
-#else
- avx_type = "scalar";
-#endif
-#endif
-
- printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
- optimizer_id,
- avx_type.c_str());
- printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
- alpha,
- betta1,
- betta2,
- weight_decay,
- (int)adamw_mode);
- }
-
- return 0;
-}
-
-void Adam_Optimizer::Step_8(float* _params,
- float* grads,
- float* _exp_avg,
- float* _exp_avg_sq,
- size_t _param_size,
- ds_half_precision_t* dev_params,
- bool half_precision)
-{
- size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
- Step_AVX<8>(&rounded_size,
- _params,
- grads,
- _exp_avg,
- _exp_avg_sq,
- _param_size,
- dev_params,
- half_precision);
-#endif
- if (_param_size > rounded_size)
- Step_4((_params + rounded_size),
- (grads + rounded_size),
- (_exp_avg + rounded_size),
- (_exp_avg_sq + rounded_size),
- (_param_size - rounded_size),
- (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
- half_precision);
-}
-
-int ds_adam_step(int optimizer_id,
- size_t step,
- float lr,
- float beta1,
- float beta2,
- float epsilon,
- float weight_decay,
- bool bias_correction,
- torch::Tensor& params,
- torch::Tensor& grads,
- torch::Tensor& exp_avg,
- torch::Tensor& exp_avg_sq)
-{
- auto params_c = params.contiguous();
- auto grads_c = grads.contiguous();
- auto exp_avg_c = exp_avg.contiguous();
- auto exp_avg_sq_c = exp_avg_sq.contiguous();
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
- // assert(params.options().dtype() == grads.options().dtype());
-
- float* params_ptr = (float*)params_c.data_ptr();
- float* grads_ptr = (float*)grads_c.data_ptr();
- float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
- float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
- std::shared_ptr<Adam_Optimizer> opt =
- std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
- opt->IncrementStep(step, beta1, beta2);
- opt->update_state(lr, epsilon, weight_decay, bias_correction);
-
- opt->Step_8(params_ptr,
- grads_ptr,
- exp_avg_ptr,
- exp_avg_sq_ptr,
- params_c.size(0),
- nullptr,
- (params.options().dtype() == at::kHalf));
-
-#if defined(__ENABLE_CUDA__)
- opt->SynchronizeStreams();
-#endif
- return 0;
-}
+// DeepSpeed Team
-int ds_adam_step_plus_copy(int optimizer_id,
- size_t step,
- float lr,
- float beta1,
- float beta2,
- float epsilon,
- float weight_decay,
- bool bias_correction,
- torch::Tensor& params,
- torch::Tensor& grads,
- torch::Tensor& exp_avg,
- torch::Tensor& exp_avg_sq,
- torch::Tensor& gpu_params)
-{
-#if defined(__ENABLE_CUDA__)
- auto params_c = params.contiguous();
- auto gpu_params_c = gpu_params.contiguous();
- auto exp_avg_c = exp_avg.contiguous();
- auto exp_avg_sq_c = exp_avg_sq.contiguous();
- auto grads_c = grads.contiguous();
-
- float* params_ptr = (float*)params_c.data_ptr();
- float* grads_ptr = (float*)grads_c.data_ptr();
- ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr();
- float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
- float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
- std::shared_ptr<Adam_Optimizer> opt =
- std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
- opt->IncrementStep(step, beta1, beta2);
- opt->update_state(lr, epsilon, weight_decay, bias_correction);
- opt->Step_8(params_ptr,
- grads_ptr,
- exp_avg_ptr,
- exp_avg_sq_ptr,
- params_c.size(0),
- gpu_params_ptr,
- (params.options().dtype() == at::kHalf));
-
- opt->SynchronizeStreams();
-#else
- assert(false);
-#endif
- return 0;
-}
-
-int destroy_adam_optimizer(int optimizer_id)
-{
- s_optimizers.erase(optimizer_id);
-
- return 0;
-}
+#include "cpu_adam.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
diff --git a/csrc/adam/cpu_adam_impl.cpp b/csrc/adam/cpu_adam_impl.cpp
new file mode 100644
index 000000000000..9a4a8d956519
--- /dev/null
+++ b/csrc/adam/cpu_adam_impl.cpp
@@ -0,0 +1,312 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cpu_adam.h"
+
+#if defined(__ENABLE_CUDA__)
+#include
+#include "cublas_v2.h"
+#include "cuda.h"
+#include "curand.h"
+#include "custom_cuda_layers.h"
+#endif
+
+static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
+
+// C++ interface
+
+void Adam_Optimizer::Step_1(float* _params,
+ float* grads,
+ float* _exp_avg,
+ float* _exp_avg_sq,
+ size_t _param_size,
+ ds_half_precision_t* dev_params,
+ bool half_precision)
+{
+ size_t rounded_size = 0;
+#if defined(__AVX512__) or defined(__AVX256__)
+ Step_AVX<1>(&rounded_size,
+ _params,
+ grads,
+ _exp_avg,
+ _exp_avg_sq,
+ _param_size,
+ dev_params,
+ half_precision);
+#endif
+ if (_param_size > rounded_size) {
+ float betta1_minus1 = 1 - _betta1;
+ float betta2_minus1 = 1 - _betta2;
+
+ float step_size = -1 * _alpha / _bias_correction1;
+ float w_decay = -1 * _alpha * _weight_decay;
+ ds_half_precision_t* grads_cast_h;
+ ds_half_precision_t* params_cast_h;
+ if (half_precision) {
+ grads_cast_h = reinterpret_cast<ds_half_precision_t*>(grads);
+ params_cast_h = reinterpret_cast<ds_half_precision_t*>(_params);
+ }
+
+ for (size_t t = rounded_size; t < _param_size; t += TILE) {
+ size_t copy_size = TILE;
+ if ((t + TILE) > _param_size) copy_size = _param_size - t;
+ size_t offset = copy_size + t;
+#if defined(__ENABLE_CUDA__)
+ if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
+#elif defined(__ENABLE_CANN__)
+ if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); }
+#endif
+#pragma omp parallel for
+ for (size_t k = t; k < offset; k++) {
+ float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
+ float param = half_precision ? (float)params_cast_h[k] : _params[k];
+ float momentum = _exp_avg[k];
+ float variance = _exp_avg_sq[k];
+ if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
+ momentum = momentum * _betta1;
+ momentum = grad * betta1_minus1 + momentum;
+
+ variance = variance * _betta2;
+ grad = grad * grad;
+ variance = grad * betta2_minus1 + variance;
+
+ grad = sqrt(variance);
+ grad = grad * _bias_correction2 + _eps;
+ grad = momentum / grad;
+ if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
+ param = grad * step_size + param;
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
+ if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
+#endif
+ if (half_precision)
+ params_cast_h[k] = (ds_half_precision_t)param;
+ else
+ _params[k] = param;
+ _exp_avg[k] = momentum;
+ _exp_avg_sq[k] = variance;
+ }
+#if defined(__ENABLE_CUDA__)
+ if (dev_params) {
+ launch_param_update(
+ _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
+
+ _buf_index = !_buf_index;
+ }
+#elif defined(__ENABLE_CANN__)
+ if (dev_params) {
+ size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]);
+ aclrtMemcpy(dev_params + t,
+ memcpy_size,
+ _doubled_buffer[_buf_index],
+ memcpy_size,
+ aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE);
+
+ _buf_index = !_buf_index;
+ }
+#endif
+ }
+ }
+}
+
+void Adam_Optimizer::Step_4(float* _params,
+ float* grads,
+ float* _exp_avg,
+ float* _exp_avg_sq,
+ size_t _param_size,
+ ds_half_precision_t* dev_params,
+ bool half_precision)
+{
+ size_t rounded_size = 0;
+#if defined(__AVX512__) or defined(__AVX256__)
+ Step_AVX<4>(&rounded_size,
+ _params,
+ grads,
+ _exp_avg,
+ _exp_avg_sq,
+ _param_size,
+ dev_params,
+ half_precision);
+#endif
+ if (_param_size > rounded_size)
+ Step_1((_params + rounded_size),
+ (grads + rounded_size),
+ (_exp_avg + rounded_size),
+ (_exp_avg_sq + rounded_size),
+ (_param_size - rounded_size),
+ (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
+ half_precision);
+}
+
+int create_adam_optimizer(int optimizer_id,
+ float alpha,
+ float betta1,
+ float betta2,
+ float eps,
+ float weight_decay,
+ bool adamw_mode,
+ bool should_log)
+{
+ auto opt =
+ std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
+
+ s_optimizers[optimizer_id] = opt;
+
+ if (should_log) {
+ std::string avx_type = "";
+#if defined(__AVX512__)
+ avx_type = "AVX512";
+#else
+#if defined(__AVX256__)
+ avx_type = "AVX2";
+#else
+ avx_type = "scalar";
+#endif
+#endif
+
+ printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
+ optimizer_id,
+ avx_type.c_str());
+ printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
+ alpha,
+ betta1,
+ betta2,
+ weight_decay,
+ (int)adamw_mode);
+ }
+
+ return 0;
+}
+
+void Adam_Optimizer::Step_8(float* _params,
+ float* grads,
+ float* _exp_avg,
+ float* _exp_avg_sq,
+ size_t _param_size,
+ ds_half_precision_t* dev_params,
+ bool half_precision)
+{
+ size_t rounded_size = 0;
+#if defined(__AVX512__) or defined(__AVX256__)
+ Step_AVX<8>(&rounded_size,
+ _params,
+ grads,
+ _exp_avg,
+ _exp_avg_sq,
+ _param_size,
+ dev_params,
+ half_precision);
+#endif
+ if (_param_size > rounded_size)
+ Step_4((_params + rounded_size),
+ (grads + rounded_size),
+ (_exp_avg + rounded_size),
+ (_exp_avg_sq + rounded_size),
+ (_param_size - rounded_size),
+ (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
+ half_precision);
+}
+
+int ds_adam_step(int optimizer_id,
+ size_t step,
+ float lr,
+ float beta1,
+ float beta2,
+ float epsilon,
+ float weight_decay,
+ bool bias_correction,
+ torch::Tensor& params,
+ torch::Tensor& grads,
+ torch::Tensor& exp_avg,
+ torch::Tensor& exp_avg_sq)
+{
+ auto params_c = params.contiguous();
+ auto grads_c = grads.contiguous();
+ auto exp_avg_c = exp_avg.contiguous();
+ auto exp_avg_sq_c = exp_avg_sq.contiguous();
+
+ // assert(params.options().dtype() == grads.options().dtype());
+
+ float* params_ptr = (float*)params_c.data_ptr();
+ float* grads_ptr = (float*)grads_c.data_ptr();
+ float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
+ float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
+
+ std::shared_ptr<Adam_Optimizer> opt =
+ std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
+ opt->IncrementStep(step, beta1, beta2);
+ opt->update_state(lr, epsilon, weight_decay, bias_correction);
+
+ opt->Step_8(params_ptr,
+ grads_ptr,
+ exp_avg_ptr,
+ exp_avg_sq_ptr,
+ params_c.numel(),
+ nullptr,
+ (params.options().dtype() == at::kHalf));
+
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
+ opt->SynchronizeStreams();
+#endif
+ return 0;
+}
+
+int ds_adam_step_plus_copy(int optimizer_id,
+ size_t step,
+ float lr,
+ float beta1,
+ float beta2,
+ float epsilon,
+ float weight_decay,
+ bool bias_correction,
+ torch::Tensor& params,
+ torch::Tensor& grads,
+ torch::Tensor& exp_avg,
+ torch::Tensor& exp_avg_sq,
+ torch::Tensor& device_params)
+{
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
+ auto params_c = params.contiguous();
+ auto device_params_c = device_params.contiguous();
+ auto exp_avg_c = exp_avg.contiguous();
+ auto exp_avg_sq_c = exp_avg_sq.contiguous();
+ auto grads_c = grads.contiguous();
+
+ float* params_ptr = (float*)params_c.data_ptr();
+ float* grads_ptr = (float*)grads_c.data_ptr();
+ ds_half_precision_t* device_params_ptr = (ds_half_precision_t*)device_params_c.data_ptr();
+ float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
+ float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
+
+ std::shared_ptr<Adam_Optimizer> opt =
+ std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
+ opt->IncrementStep(step, beta1, beta2);
+ opt->update_state(lr, epsilon, weight_decay, bias_correction);
+ opt->Step_8(params_ptr,
+ grads_ptr,
+ exp_avg_ptr,
+ exp_avg_sq_ptr,
+ params_c.numel(),
+ device_params_ptr,
+ (params.options().dtype() == at::kHalf));
+
+ opt->SynchronizeStreams();
+#else
+ assert(false);
+#endif
+ return 0;
+}
+
+int destroy_adam_optimizer(int optimizer_id)
+{
+ s_optimizers.erase(optimizer_id);
+
+ return 0;
+}
diff --git a/csrc/adam/fused_adam_frontend.cpp b/csrc/adam/fused_adam_frontend.cpp
index b06531c53002..13b390248608 100644
--- a/csrc/adam/fused_adam_frontend.cpp
+++ b/csrc/adam/fused_adam_frontend.cpp
@@ -1,3 +1,8 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
#include
void multi_tensor_adam_cuda(int chunk_size,
diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu
index 3cb9763befce..1b697d989b1a 100644
--- a/csrc/adam/multi_tensor_adam.cu
+++ b/csrc/adam/multi_tensor_adam.cu
@@ -1,6 +1,11 @@
-/* Copyright 2020 The Microsoft DeepSpeed Team
- Copyright NVIDIA/apex
- This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include
diff --git a/csrc/adam/multi_tensor_apply.cuh b/csrc/adam/multi_tensor_apply.cuh
index 13af4b7578f6..12f41cb49c6b 100644
--- a/csrc/adam/multi_tensor_apply.cuh
+++ b/csrc/adam/multi_tensor_apply.cuh
@@ -1,6 +1,11 @@
-/* Copyright 2020 The Microsoft DeepSpeed Team
- Copyright NVIDIA/apex
- This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include
diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp
index 9e405d8e704c..32b0e8a32394 100644
--- a/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/csrc/aio/common/deepspeed_aio_common.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
@@ -113,10 +115,13 @@ static int _do_io_complete(const long long int min_completes,
std::vector>& reap_times)
{
const auto start_time = std::chrono::high_resolution_clock::now();
- const auto n_completes = io_getevents(
- aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr);
+ long long int n_completes = io_pgetevents(aio_ctxt->_io_ctxt,
+ min_completes,
+ max_completes,
+ aio_ctxt->_io_events.data(),
+ nullptr,
+ nullptr);
reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time);
-
assert(n_completes >= min_completes);
return n_completes;
}
@@ -262,7 +267,7 @@ void report_file_error(const char* filename, const std::string file_op, const in
int open_file(const char* filename, const bool read_op)
{
- const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT);
+ const int flags = read_op ? (O_RDONLY | O_DIRECT) : (O_WRONLY | O_CREAT | O_DIRECT);
const int mode = 0600;
const auto fd = open(filename, flags, mode);
if (fd == -1) {
diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h
index cc62d33765c8..2940de945ee8 100644
--- a/csrc/aio/common/deepspeed_aio_common.h
+++ b/csrc/aio/common/deepspeed_aio_common.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_types.cpp b/csrc/aio/common/deepspeed_aio_types.cpp
index e5811bb91149..5e34a61065d4 100644
--- a/csrc/aio/common/deepspeed_aio_types.cpp
+++ b/csrc/aio/common/deepspeed_aio_types.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_types.h b/csrc/aio/common/deepspeed_aio_types.h
index be3b352d6be2..ce6a4e5cdfa7 100644
--- a/csrc/aio/common/deepspeed_aio_types.h
+++ b/csrc/aio/common/deepspeed_aio_types.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp
index e8bf9de11259..763b2c253a34 100644
--- a/csrc/aio/common/deepspeed_aio_utils.cpp
+++ b/csrc/aio/common/deepspeed_aio_utils.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h
index 6c5952749dd3..9c58c2286610 100644
--- a/csrc/aio/common/deepspeed_aio_utils.h
+++ b/csrc/aio/common/deepspeed_aio_utils.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
index a2670fb7b4cb..e9c6a8505858 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
@@ -22,7 +24,8 @@ io_op_desc_t::io_op_desc_t(const bool read_op,
_num_bytes(num_bytes),
_validate(validate)
{
- _cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer;
+ _cpu_buffer = (_buffer.is_cuda() || _buffer.is_xpu()) ? _buffer.to(torch::kCPU).pin_memory()
+ : _buffer;
_contiguous_buffer = _cpu_buffer.contiguous();
}
@@ -31,6 +34,7 @@ char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr
void io_op_desc_t::fini()
{
if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
+ if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); }
}
deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h
index d1cfcab8bfc2..20799ecbb018 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.h
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
index 20bdf5b142a6..752823dc7dd2 100644
--- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
+++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2023 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for managing CPU tensors occupying page-locked memory.
*/
diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.h b/csrc/aio/py_lib/deepspeed_pin_tensor.h
index a421bbc8f3b1..4350a4ac7df6 100644
--- a/csrc/aio/py_lib/deepspeed_pin_tensor.h
+++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h
@@ -1,11 +1,14 @@
-/*
-Copyright 2023 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for managing CPU tensors occupying page-locked memory.
TODO: Implement a full-featured manager that
- 1. Avoid page-locked memory leaks
- 2. Minimize page-locked memory usage by reducing internal fragmentation
+1. Avoid page-locked memory leaks
+2. Minimize page-locked memory usage by reducing internal fragmentation
*/
#include