diff --git a/.flake8 b/.flake8
new file mode 100644
index 000000000000..e694a9d33d04
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+ignore = E,F403,F405,F541,F841,W
+select = E9,F,W6
+per-file-ignores =
+ __init__.py:F401
diff --git a/.github/ISSUE_TEMPLATE/ci_failure_report.md b/.github/ISSUE_TEMPLATE/ci_failure_report.md
new file mode 100644
index 000000000000..6bf4c7762319
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/ci_failure_report.md
@@ -0,0 +1,10 @@
+---
+name: CI failure report
+about: Report a DeepSpeed CI failure
+title: "{{ env.GITHUB_WORKFLOW }} CI test failure"
+labels: ci-failure
+assignees: ''
+
+---
+
+The Nightly CI for {{ env.GITHUB_SERVER_URL }}/{{ env.GITHUB_REPOSITORY }}/actions/runs/{{ env.GITHUB_RUN_ID }} failed.
diff --git a/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md
new file mode 100644
index 000000000000..bf997775fe32
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md
@@ -0,0 +1,44 @@
+---
+name: Bug report (DeepSpeed-Chat)
+about: Create a DeepSpeed-Chat related issue to help us improve
+title: "[BUG]"
+labels: bug,deepspeed-chat
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is. Please include which training step you are using and which model you are training.
+
+**Log output**
+If you used `train.py` to launch the application, please include the contents of the output log file.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Command/Script to reproduce
+2. What packages are required and their versions
+3. How to run the script
+4. ...
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**ds_report output**
+Please run `ds_report` to give us details about your setup.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**System info (please complete the following information):**
+ - OS: [e.g. Ubuntu 18.04]
+ - GPU count and types [e.g. two machines with x8 A100s each]
+ - (if applicable) which [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version you are using
+ - (if applicable) Hugging Face Transformers/Accelerate/etc. versions
+ - Python version
+ - Any other relevant info about your setup
+
+**Docker context**
+Are you using a specific docker image that you can share?
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/workflows/amd.yml b/.github/workflows/amd-mi100.yml
similarity index 68%
rename from .github/workflows/amd.yml
rename to .github/workflows/amd-mi100.yml
index 1552bff9695a..7ad0f4178db4 100644
--- a/.github/workflows/amd.yml
+++ b/.github/workflows/amd-mi100.yml
@@ -1,35 +1,29 @@
-name: amd
+name: amd-mi100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
- pull_request:
- paths-ignore:
- - 'docs/**'
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
- unit-tests:
+ amd-tests:
# The type of runner that the job will run on
- runs-on: [self-hosted, amd]
+ runs-on: [self-hosted, amd, mi100]
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
+ pip install --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -56,7 +50,7 @@ jobs:
# Runs a set of commands using the runners shell
- name: Unit tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 -n 4 --verbose unit/
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/
+ pytest $PYTEST_OPTS -n 4 --verbose unit/
+ pytest $PYTEST_OPTS -m 'sequential' unit/
diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
new file mode 100644
index 000000000000..8c4292d4675c
--- /dev/null
+++ b/.github/workflows/amd-mi200.yml
@@ -0,0 +1,83 @@
+name: amd-mi200
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ amd-tests:
+ # The type of runner that the job will run on
+ runs-on: [self-hosted, amd, mi200]
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ # git checkout 1cc453d33
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install (ROCm) apex
+ run: |
+ git clone https://github.com/ROCmSoftwarePlatform/apex.git
+ cd apex
+ git checkout torch_2.1_higher
+ CURRENT_VER=$(git rev-parse HEAD)
+ INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
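+          # Rebuild apex only when the checked-out commit differs from the version recorded for the cached /blob/amd-apex build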
+ if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
+ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings="--global-option=--cpp_ext" --config-settings="--global-option=--cuda_ext" --target=/blob/amd-apex/ --upgrade .
+ git rev-parse HEAD > /blob/amd-apex/.venv_installed_version
+ fi
+ echo PYTHONPATH=$PYTHONPATH:/blob/amd-apex/ >> $GITHUB_ENV
+      # Runs a set of commands using the runner's shell
+ - name: Install deepspeed
+ run: |
+ pip install .[dev,1bit,autotuning]
+ #python -c "from deepspeed.env_report import cli_main; cli_main()"
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+      # Runs a set of commands using the runner's shell
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ pytest $PYTEST_OPTS -n 4 --verbose unit/
+ pytest $PYTEST_OPTS -m 'sequential' unit/
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/auto-sync.yml b/.github/workflows/auto-sync.yml
index 5cc5dc02224f..bfbf5a2ae37a 100644
--- a/.github/workflows/auto-sync.yml
+++ b/.github/workflows/auto-sync.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
with:
token: ${{ secrets.GHP_TOKEN }}
repository: ${{ secrets.DST_REPO }}
diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
new file mode 100644
index 000000000000..521fe2b5bea4
--- /dev/null
+++ b/.github/workflows/cpu-inference.yml
@@ -0,0 +1,75 @@
+name: cpu-inference
+
+on:
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ unit-tests:
+ runs-on: ubuntu-20.04
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Detect instruction sets on instance
+ run: |
+ lscpu
+ pip install cmake
+ git clone https://github.com/intel/intel-extension-for-pytorch
+ cd intel-extension-for-pytorch/tests/cpu/isa
+ cmake .
+ make
+ ./cpu_features
+
+ - name: Install numactl
+ run: |
+ sudo apt-get install -y numactl
+
+ - name: Install oneCCL Bindings for PyTorch
+ run: |
+ python -m pip install intel_extension_for_pytorch
+ python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu
+
+ - name: Install oneCCL
+ run: |
+ git clone https://github.com/oneapi-src/oneCCL
+ cd oneCCL
+ mkdir build
+ cd build
+ cmake ..
+ make
+ make install
+ #source ./_install/env/setvars.sh
+ # test whether oneCCL is correctly installed
+ #mpirun -n 2 ./examples/benchmark/benchmark
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install deepspeed
+ run: |
+ # check why the host does not have AVX2 support
+ pip install .[dev,1bit,autotuning,inf]
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: Unit tests
+ run: |
+ source oneCCL/build/_install/env/setvars.sh
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+ TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index b62e6266cbb5..a168af277fb8 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -1,12 +1,13 @@
name: Formatting
on:
- push:
- branches:
- - 'staging**'
pull_request:
branches:
'**'
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,19 +20,20 @@ jobs:
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: environment
run: |
which python
python --version
- - name: Install deepspeed
+ - name: Install dependencies
run: |
- pip install .[dev,autotuning]
- ds_report
+          # Previously we would do pip install .[dev], but this started causing
+          # out-of-space errors with the torch 2.1.0 release
+ grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install
- name: Formatting checks
run: |
- pip show pre-commit clang-format
- pre-commit run --all-files
+ pip show pre-commit clang-format
+ pre-commit run --all-files
diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
new file mode 100644
index 000000000000..a2b99de488d5
--- /dev/null
+++ b/.github/workflows/nv-a6000.yml
@@ -0,0 +1,63 @@
+name: nv-a6000
+
+on:
+ pull_request:
+ paths:
+ - "deepspeed/inference/v2/**"
+ - "tests/unit/inference/v2/**"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, a6000]
+ container:
+ image: nvcr.io/nvidia/pytorch:23.03-py3
+ ports:
+ - 80
+ options: --gpus all --shm-size "8G"
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Check container state
+ run: |
+ ldd --version
+ nvcc --version
+ nvidia-smi
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Install transformers
+ run: |
+ git clone --depth=1 https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ python -m pip install .
+ - name: Install deepspeed
+ run: |
+ python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+ python -m pip install .[dev,1bit,autotuning]
+ ds_report
+ - name: Python environment
+ run: |
+ python -m pip list
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
+ - name: MII unit tests
+ run: |
+ git clone --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
+ cd DeepSpeed-MII
+ pip install .[dev]
+ cd tests
+ python -m pytest --color=yes --durations=0 --verbose -rF ./
diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 081e2c7b0f00..0f6491e08336 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -1,14 +1,16 @@
name: nv-accelerate-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu111, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -41,11 +43,9 @@ jobs:
- name: HF Accelerate tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
git clone https://github.com/huggingface/accelerate
cd accelerate
- # tmp fix
- git checkout 5f4ba04628eeea14f9d248ab0e54399899503532
git rev-parse --short HEAD
# installing dependencies
pip install .[testing]
@@ -54,4 +54,4 @@ jobs:
# tmp fix: force newer datasets version
#pip install "datasets>=2.0.0"
pip list
- HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
+ pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed
diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
new file mode 100644
index 000000000000..b53fac36315b
--- /dev/null
+++ b/.github/workflows/nv-ds-chat.yml
@@ -0,0 +1,69 @@
+name: nv-ds-chat
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+ inputs:
+ dse_branch:
+ description: 'DeepSpeedExamples Branch'
+ required: false
+ default: 'master'
+ type: string
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, cu116, v100]
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install deepspeed
+ run: |
+ pip install .[dev]
+ ds_report
+
+ - name: Install deepspeed-chat
+ run: |
+ BRANCH="master"
+ if [[ ! -z "${{ github.event.inputs.dse_branch }}" ]]; then
+ BRANCH="${{ github.event.inputs.dse_branch }}"
+ fi
+ echo "DeepSpeedExamples Branch: $BRANCH"
+ git clone -b $BRANCH https://github.com/microsoft/DeepSpeedExamples.git
+ cd DeepSpeedExamples/applications/DeepSpeed-Chat
+ pip install -r requirements.txt
+ pip install -e .
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: DS-Chat unit tests
+ run: |
+ cd DeepSpeedExamples/applications/DeepSpeed-Chat
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ pytest $PYTEST_OPTS ./
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml
new file mode 100644
index 000000000000..a1b812b3eafd
--- /dev/null
+++ b/.github/workflows/nv-h100.yml
@@ -0,0 +1,65 @@
+name: nv-h100
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, h100]
+ container:
+ image: nvcr.io/nvidia/pytorch:23.03-py3
+ ports:
+ - 80
+ options: --gpus all --shm-size "8G"
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Check container state
+ run: |
+ nvidia-smi
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ python -m pip install .
+
+ - name: Install deepspeed
+ run: |
+ python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+ python -m pip install .[dev,1bit,autotuning]
+ ds_report
+
+ - name: Python environment
+ run: |
+ python -m pip list
+
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ python -m pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
+ python -m pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
index 01e6ac9ee264..f20b4496b6df 100644
--- a/.github/workflows/nv-inference.yml
+++ b/.github/workflows/nv-inference.yml
@@ -1,14 +1,16 @@
name: nv-inference
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -34,12 +36,13 @@ jobs:
run: |
git clone https://github.com/huggingface/transformers
cd transformers
+ git checkout f370bebdc
git rev-parse --short HEAD
pip install .
- name: Install deepspeed
run: |
- pip install .[dev,1bit,autotuning,inf]
+ pip install .[dev,1bit,autotuning,inf,triton]
ds_report
- name: Python environment
@@ -49,8 +52,14 @@
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
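+          # Run pytest under coverage; each test process writes its own data file, merged by the "coverage combine" step below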
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+
+ - name: Coverage report
+ run: |
+ cd tests
+ coverage combine
+ coverage report -m
diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
index e564a29d1ab2..d25d40aef967 100644
--- a/.github/workflows/nv-lightning-v100.yml
+++ b/.github/workflows/nv-lightning-v100.yml
@@ -1,14 +1,16 @@
name: nv-lightning-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu111, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -41,8 +43,8 @@ jobs:
- name: PyTorch Lightning Tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
pip install pytorch-lightning
pip install "protobuf<4.21.0"
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose lightning/
+ pytest $PYTEST_OPTS lightning/
diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml
index c490bc45d357..3a3b70dcd17d 100644
--- a/.github/workflows/nv-megatron.yml
+++ b/.github/workflows/nv-megatron.yml
@@ -1,14 +1,16 @@
name: nv-megatron
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -37,7 +39,15 @@ jobs:
- name: Install apex
run: |
- pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git
+ git clone https://github.com/NVIDIA/apex.git
+ cd apex
+ CURRENT_VER=$(git rev-parse HEAD)
+ INSTALLED_VER=$(cat /blob/apex/.venv_installed_version)
+ if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
+ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--global-option=--cpp_ext" --config-settings "--global-option=--cuda_ext" --target=/blob/apex/ --upgrade .
+ git rev-parse HEAD > /blob/apex/.venv_installed_version
+ fi
+ echo PYTHONPATH=$PYTHONPATH:/blob/apex/ >> $GITHUB_ENV
- name: Python environment
run: |
@@ -45,10 +55,9 @@ jobs:
- name: Megatron unit tests
run: |
- git clone --branch mrwyattii/fix-deprecated-numpy-types https://github.com/microsoft/Megatron-DeepSpeed.git
+ git clone https://github.com/microsoft/Megatron-DeepSpeed.git
cd Megatron-DeepSpeed
pip install .
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose ./
+ pytest $PYTEST_OPTS ./
diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
index b5f54fad46ec..86de2a3b0bcb 100644
--- a/.github/workflows/nv-mii.yml
+++ b/.github/workflows/nv-mii.yml
@@ -1,14 +1,14 @@
name: nv-mii
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +19,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -54,6 +54,5 @@ jobs:
cd DeepSpeed-MII
pip install .[dev]
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- cd tests
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./
+ cd tests/legacy
+ pytest $PYTEST_OPTS --forked -m "deepspeed" ./
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index 04d545dadd6b..1ed7d34a6be4 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -8,19 +8,23 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
+permissions:
+ contents: read
+ issues: write
+
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -45,6 +49,14 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
new file mode 100644
index 000000000000..839312190d22
--- /dev/null
+++ b/.github/workflows/nv-pre-compile-ops.yml
@@ -0,0 +1,42 @@
+name: nv-pre-compile-ops
+
+on:
+ pull_request:
+ branches:
+ '**'
+ paths-ignore:
+ - 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build-ops:
+ runs-on: ubuntu-20.04
+ container:
+ image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: environment
+ run: |
+ which python
+ python --version
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Compile DeepSpeed Ops
+ run: |
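+          # DS_BUILD_OPS=1 pre-compiles every op for the archs in TORCH_CUDA_ARCH_LIST; the individual DS_BUILD_*=0 flags opt those ops back out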
+ TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+ - name: DS Report
+ run: |
+ ds_report
diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml
new file mode 100644
index 000000000000..5ca159074a4d
--- /dev/null
+++ b/.github/workflows/nv-sd.yml
@@ -0,0 +1,70 @@
+name: nv-sd
+
+on:
+ schedule:
+ - cron: "0 0 * * 0"
+ workflow_dispatch:
+ pull_request:
+ paths:
+ - "deepspeed/ops/transformer/inference/diffusers_**"
+ - "tests/unit/inference/test_stable_diffusion.py"
+ - "deepspeed/model_implementations/diffusers/unet.py"
+ - "deepspeed/model_implementations/diffusers/vae.py"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ sd-tests:
+ runs-on: [self-hosted, nvidia, a6000]
+ container:
+ image: nvcr.io/nvidia/pytorch:23.03-py3
+ ports:
+ - 80
+ options: --gpus all --shm-size "8G"
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Check container state
+ run: |
+ ldd --version
+ nvcc --version
+ nvidia-smi
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ python -m pip install .
+ - name: Install deepspeed
+ run: |
+ pip install image-similarity-measures
+ python -m pip install opencv-python==4.6.* --force-reinstall
+ python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+ python -m pip install .[dev,1bit,autotuning,sd]
+ ds_report
+ - name: Python environment
+ run: |
+ python -m pip list
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12"
+
+ - name: Open GitHub issue if weekly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch-latest-cpu.yml b/.github/workflows/nv-torch-latest-cpu.yml
index d0ccc29deaa5..9ca1529d9018 100644
--- a/.github/workflows/nv-torch-latest-cpu.yml
+++ b/.github/workflows/nv-torch-latest-cpu.yml
@@ -1,14 +1,16 @@
name: nv-torch-latest-cpu
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,7 +21,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
@@ -42,7 +44,6 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -n 4 unit/ --torch_ver="1.12"
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/ --torch_ver="1.12"
+ TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="1.12"
+ TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="1.12"
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index 2a3dcc4acc99..8813a4bb2c4f 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -1,14 +1,16 @@
name: nv-torch-latest-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,14 +21,14 @@ jobs:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -51,7 +53,12 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6"
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.1" --cuda_ver="11.8"
+ coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.1" --cuda_ver="11.8"
+
+ - name: Coverage report
+ run: |
+ cd tests
+ coverage combine
+ coverage report -m
diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml
index 625517d59167..d0df6e546982 100644
--- a/.github/workflows/nv-torch-nightly-v100.yml
+++ b/.github/workflows/nv-torch-nightly-v100.yml
@@ -8,19 +8,23 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
+permissions:
+ contents: read
+ issues: write
+
jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
+ pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -45,7 +49,15 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
+ pytest $PYTEST_OPTS --forked -n 4 unit/
+ pytest $PYTEST_OPTS --forked -m 'sequential' unit/
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch18-v100.yml b/.github/workflows/nv-torch110-p40.yml
similarity index 54%
rename from .github/workflows/nv-torch18-v100.yml
rename to .github/workflows/nv-torch110-p40.yml
index 0e1cd79b419f..45f3e0438233 100644
--- a/.github/workflows/nv-torch18-v100.yml
+++ b/.github/workflows/nv-torch110-p40.yml
@@ -1,32 +1,31 @@
-name: nv-torch18-v100
+name: nv-torch110-p40
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
- pull_request:
- paths-ignore:
- - 'docs/**'
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
+permissions:
+ contents: read
+ issues: write
+
jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu111, v100]
+ runs-on: [self-hosted, nvidia, cu111, p40]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+ pip install -U --cache-dir $TORCH_CACHE torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -41,7 +40,7 @@ jobs:
- name: Install deepspeed
run: |
- pip install .[dev,1bit,autotuning]
+ pip install .[dev,1bit,autotuning] --no-build-isolation
ds_report
- name: Python environment
@@ -51,7 +50,14 @@ jobs:
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="11"
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.8" --cuda_ver="11"
+ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11.1"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch110-v100.yml b/.github/workflows/nv-torch110-v100.yml
new file mode 100644
index 000000000000..1fd8aaac0ffa
--- /dev/null
+++ b/.github/workflows/nv-torch110-v100.yml
@@ -0,0 +1,64 @@
+name: nv-torch110-v100
+
+on:
+ schedule:
+ - cron: "0 0 * * *"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, cu111, v100]
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip install -U --cache-dir $TORCH_CACHE torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ # git checkout 1cc453d33
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install deepspeed
+ run: |
+ pip install .[dev,1bit,autotuning] --no-build-isolation
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11"
+ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="1.10" --cuda_ver="11"
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/.github/workflows/nv-torch18-p40.yml b/.github/workflows/nv-torch18-p40.yml
deleted file mode 100644
index 45aeeed5078f..000000000000
--- a/.github/workflows/nv-torch18-p40.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-name: nv-torch18-p40
-
-on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
- pull_request:
- paths-ignore:
- - 'docs/**'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-jobs:
- unit-tests:
- runs-on: [self-hosted, nvidia, cu101, p40]
-
- steps:
- - uses: actions/checkout@v2
-
- - id: setup-venv
- uses: ./.github/workflows/setup-venv
-
- - name: Install pytorch
- run: |
- pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu101
- python -c "import torch; print('torch:', torch.__version__, torch)"
- python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
-
- - name: Install transformers
- run: |
- git clone https://github.com/huggingface/transformers
- cd transformers
- # if needed switch to the last known good SHA until transformers@master is fixed
- # git checkout 1cc453d33
- git rev-parse --short HEAD
- pip install .
-
- - name: Install deepspeed
- run: |
- pip install .[dev,1bit,autotuning]
- ds_report
-
- - name: Python environment
- run: |
- pip list
-
- - name: Unit tests
- run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- cd tests
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1"
diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index ec84f2234836..7753133f2886 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -1,14 +1,16 @@
name: nv-transformers-v100
on:
- push:
- branches:
- - 'staging**'
- paths-ignore:
- - 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
+ - 'blogs/**'
+ - 'deepspeed/inference/v2/**'
+ - "tests/unit/inference/v2/**"
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -16,20 +18,30 @@ concurrency:
jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu111, v100]
+ runs-on: [self-hosted, nvidia, cu116, v100]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- id: setup-venv
uses: ./.github/workflows/setup-venv
- name: Install pytorch
run: |
- pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+ # use the same pytorch version as transformers CI
+ pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ git checkout e7e9261a2
+ git rev-parse --short HEAD
+ pip install .
+
- name: Install deepspeed
run: |
pip install .[dev,autotuning]
@@ -41,19 +53,12 @@ jobs:
- name: HF transformers tests
run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- git clone https://github.com/huggingface/transformers
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd transformers
- # if needed switch to the last known good SHA until transformers@master is fixed
- #git checkout 6268694e2
- git rev-parse --short HEAD
- # scipy/sklearn required for tests, using the 'dev' extra forces torch re-install
pip install .[testing]
# find reqs used in ds integration tests
find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
- # force datasets version due to issues
- pip install datasets==2.2.2
# force protobuf version due to issues
pip install "protobuf<4.21.0"
pip list
- HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
+ WANDB_DISABLED=true RUN_SLOW=1 pytest $PYTEST_OPTS tests/deepspeed
diff --git a/.github/workflows/pre-compile-ops.yml b/.github/workflows/pre-compile-ops.yml
deleted file mode 100644
index 2ff3bb6a4fc7..000000000000
--- a/.github/workflows/pre-compile-ops.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-# This is a basic workflow to help you get started with Actions
-
-name: Tests-w-precompiled-ops
-
-# Controls when the action will run.
-on:
- # Allows you to run this workflow manually from the Actions tab
- workflow_dispatch:
-
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
- # This workflow contains a single job called "build"
- build:
- # The type of runner that the job will run on
- runs-on: self-hosted
-
- # Steps represent a sequence of tasks that will be executed as part of the job
- steps:
- # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- - uses: actions/checkout@v2
-
- - id: setup-venv
- uses: ./.github/workflows/setup-venv
-
- # Runs a single command using the runners shell
- - name: environment
- run: |
- python -c "import torch; print('torch:', torch.__version__, torch)"
- python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
-
- # Runs a set of commands using the runners shell
- - name: Install deepspeed
- run: |
- DS_BUILD_OPS=1 pip install .[dev]
- ds_report
-
- - name: Formatting checks
- run: |
- pre-commit run --all-files
-
- # Runs a set of commands using the runners shell
- - name: Unit tests
- run: |
- if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
- TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9de35a6d17f6..279bad471c01 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -1,12 +1,16 @@
name: python
on:
- push:
- branches:
- - 'staging**'
pull_request:
branches:
'**'
+ paths-ignore:
+ - 'docs/**'
+ - 'blogs/**'
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -24,7 +28,7 @@ jobs:
image: deepspeed/gh-builder:py${{ matrix.pyVersion }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: environment
run: |
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 000000000000..8e016b4169cb
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,51 @@
+name: Build and publish DeepSpeed release
+
+on:
+ push:
+ tags:
+ - 'v*.*.*'
+
+jobs:
+ deploy:
+ runs-on: ubuntu-20.04
+ environment: release-env
+
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ ref: "master"
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+ - name: Get release version from tag
+ run: |
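+          # ${GITHUB_REF#refs/*/v} strips the leading "refs/tags/v", leaving just the X.Y.Z version number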
+ echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> $GITHUB_ENV
+ - name: Check release version
+ run: |
+ pip install packaging
+ python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }}
+ - name: Build DeepSpeed
+ run: |
+ DS_BUILD_STRING=" " python setup.py sdist
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ password: ${{ secrets.PYPI_API_TOKEN }}
+ repository-url: https://upload.pypi.org/legacy/
+ - name: Bump version
+ run: |
+ python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }}
+ - name: Create Pull Request
+ uses: peter-evans/create-pull-request@v4
+ with:
+ token: ${{ secrets.GH_PAT }}
+ add-paths: |
+ version.txt
+ body: |
+ **Auto-generated PR to update version.txt after a DeepSpeed release**
+ Released version - ${{ env.RELEASE_VERSION }}
+ Author - @${{ github.actor }}
+ branch: AutoPR/${{ env.RELEASE_VERSION }}
+ assignees: ${{ github.actor }}
+ title: "Update version.txt after ${{ env.RELEASE_VERSION }} release"
+ author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>
diff --git a/.github/workflows/setup-venv/action.yml b/.github/workflows/setup-venv/action.yml
index dacd50b8d471..ce2c458b9e57 100644
--- a/.github/workflows/setup-venv/action.yml
+++ b/.github/workflows/setup-venv/action.yml
@@ -12,11 +12,25 @@
shell: bash
- id: create-venv
run: |
+ rm -rf ./unit-test-venv
python -m venv unit-test-venv
source ./unit-test-venv/bin/activate
python -m pip install --upgrade pip
+ pip install wheel # required after pip>=23.1
echo PATH=$PATH >> $GITHUB_ENV # Make it so venv is inherited for other steps
shell: bash
+ - id: set-env-vars
+ run: |
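+      # Shared /blob cache locations and the default pytest flags ($PYTEST_OPTS) consumed by the workflow test steps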
+ echo TEST_DATA_DIR=/blob/ >> $GITHUB_ENV
+ echo TRANSFORMERS_CACHE=/blob/transformers_cache/ >> $GITHUB_ENV
+ echo TORCH_EXTENSIONS_DIR=./torch-extensions/ >> $GITHUB_ENV
+ echo TORCH_CACHE=/blob/torch_cache/ >> $GITHUB_ENV
+ echo HF_DATASETS_CACHE=/blob/datasets_cache/ >> $GITHUB_ENV
+ echo MEGATRON_CKPT_DIR=/blob/megatron_ckpt/ >> $GITHUB_ENV
+ echo CRITIC_CKPT_DIR=/blob/step2_opt_125m_ckpt/ >> $GITHUB_ENV
+ echo PYTEST_OPTS="--color=yes --durations=0 --verbose -rF" >> $GITHUB_ENV
+ shell: bash
- id: print-env
run: |
which python
diff --git a/.gitignore b/.gitignore
index ab364ad8a7e7..5b9cc7ac3156 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,31 +1,40 @@
+# Ignore Python compiled files
*.pyc
-.idea/
-*~
-*.swp
+
+# Ignore IDE-specific files and directories (JetBrains, VS Code, Theia)
+.idea/
+.vscode/
+.theia/
+
+# Ignore temporary and backup files (general backups, Vim swap files)
+*~
+*.swp
+
+# Ignore log files
*.log
+
+# Ignore the generated version info file
deepspeed/git_version_info_installed.py
+
+# Ignore Python bytecode cache
__pycache__
# Build + installation data
-build/
-dist/
-*.so
-deepspeed.egg-info/
-build.txt
-
-# Website
-docs/_site/
-docs/build
+build/
+dist/
+*.so
+deepspeed.egg-info/
+build.txt
+
+# Website generated files (Jekyll site and generated docs)
+docs/_site/
+docs/build
docs/code-docs/source/_build
docs/code-docs/_build
docs/code-docs/build
-.sass-cache/
-.jekyll-cache/
+.sass-cache/
+.jekyll-cache/
.jekyll-metadata
# Testing data
-tests/unit/saved_checkpoint/
-
-# Dev/IDE data
-.vscode
-.theia
+tests/unit/saved_checkpoint/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index db67534e1936..2432a7a24124 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,12 +22,12 @@ repos:
- id: requirements-txt-fixer
- id: trailing-whitespace
-- repo: https://github.com/pre-commit/mirrors-yapf
- rev: v0.31.0
+- repo: https://github.com/google/yapf
+ rev: v0.32.0
hooks:
- id: yapf
-- repo: https://gitlab.com/daverona/pre-commit-cpp
+- repo: https://gitlab.com/daverona/pre-commit/cpp
rev: 0.8.0
hooks:
- id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
@@ -38,7 +38,7 @@ repos:
- id: check-torchdist
name: check-torchdist
entry: ./scripts/check-torchdist.py
- language: script
+ language: python
exclude: ^(deepspeed/comm/|docs/|benchmarks/|scripts/check-torchdist.py|deepspeed/moe/sharded_moe.py|deepspeed/runtime/comm/coalesced_collectives.py|deepspeed/elasticity/elastic_agent.py|deepspeed/launcher/launch.py|tests/unit/comm/test_dist.py)
# Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm
@@ -47,8 +47,9 @@ repos:
- id: check-license
name: check-license
entry: ./scripts/check-license.py
- language: script
- files: \.(py|cc|cu|h|cuh|hip)$
+ language: python
+ files: \.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr)$
+ exclude: ^(deepspeed/inference/v2/kernels/ragged_ops/blocked_flash|deepspeed/inference/v2/kernels/cutlass_ops/grouped_gemm)
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
@@ -58,7 +59,7 @@ repos:
# Do not check files that are automatically generated
'--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
'--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
- '--ignore-words-list=unsupport', # Word used in error messages that need rewording
+          '--ignore-words-list=youn,unsupport,noe', # Words used in error messages that need rewording
--check-filenames,
--check-hidden
]
@@ -67,4 +68,13 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
- args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']
+ args: ['--config=.flake8']
+
+- repo: local
+ hooks:
+ - id: check-torchcuda
+ name: check-torchcuda
+ entry: ./scripts/check-torchcuda.py
+ language: python
+ exclude: ^(.github/workflows/|scripts/check-torchcuda.py|docs/_tutorials/accelerator-abstraction-interface.md|accelerator/cuda_accelerator.py|deepspeed/inference/engine.py|deepspeed/model_implementations/transformers/clip_encoder.py|deepspeed/model_implementations/diffusers/vae.py|deepspeed/model_implementations/diffusers/unet.py|op_builder/spatial_inference.py|op_builder/transformer_inference.py|op_builder/builder.py|setup.py|tests/unit/ops/sparse_attention/test_sparse_attention.py)
+ # Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm
diff --git a/.readthedocs.yml b/.readthedocs.yml
index a2da36620152..91102a7de54b 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,6 +1,9 @@
-
# Required
version: 2
+build:
+ os: "ubuntu-22.04"
+ tools:
+ python: "3.8"
# Build documentation in the docs/ directory with Sphinx
sphinx:
@@ -13,6 +16,5 @@ formats:
# Optionally set the version of Python and requirements required to build your docs
python:
- version: 3.7
install:
- requirements: requirements/requirements-readthedocs.txt
diff --git a/.style.yapf b/.style.yapf
index 4a4850fe4df6..be8721dd3e5c 100644
--- a/.style.yapf
+++ b/.style.yapf
@@ -1,3 +1,3 @@
[style]
-SPLIT_ALL_COMMA_SEPARATED_VALUES = true
-COLUMN_LIMIT = 89
+SPLIT_ALL_COMMA_SEPARATED_VALUES = false
+COLUMN_LIMIT = 119
diff --git a/CODEOWNERS b/CODEOWNERS
index 5fc20409c276..2410b3ebc09b 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -7,7 +7,7 @@
# top-level repo folders
-/.github/ @jeffra @mrwyattii
+/.github/ @jeffra @mrwyattii @loadams
/azure/ @jeffra @awan-10
/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith
/bin/ @jeffra
diff --git a/LICENSE b/LICENSE
index 9e841e7a26e4..261eeb9e9f8b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,201 @@
- MIT License
-
- Copyright (c) Microsoft Corporation.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/MANIFEST.in b/MANIFEST.in
index 2fec750c6644..ab79573ef96c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,6 @@
include *.txt README.md
+include deepspeed/inference/v2/kernels/ragged_ops/libs/*.so
+include deepspeed/inference/v2/kernels/cutlass_ops/libs/*.so
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
diff --git a/README.md b/README.md
index bfa03a6e8c9a..b50b85af844f 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,4 @@
[![License MIT](https://badgen.net/badge/license/MIT/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
-[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/)
-[![Downloads](https://pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed)
-[![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status)
@@ -9,222 +6,11 @@
-## Latest News
- DeepSpeed trained the world's most powerful language models ([MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)); [learn how](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/).
+## DeeperSpeed
-* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without providing an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/)
-* [2022/12] [DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality](https://www.deepspeed.ai/2022/12/11/data-efficiency.html)
-* [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img)
-* [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html)
-* [2022/09] [ZeRO-Inference: Democratizing massive model inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html)
-* [2022/07] [Azure and DeepSpeed empower easy-to-use and high-performance model training](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/)
+DeeperSpeed is a fork of Microsoft's [DeepSpeed](https://github.com/microsoft/DeepSpeed) library that is tailor-made for [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) by [EleutherAI](https://www.eleuther.ai/).
----
+Prior to 3/9/2023, DeeperSpeed was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries:
-# Extreme Speed and Scale for DL Training and Inference
-
-[DeepSpeed](https://www.deepspeed.ai/) is an easy-to-use deep learning optimization software suite that enables unprecedented scale and speed for Deep Learning Training and Inference. With DeepSpeed you can:
-
-* Train/Inference dense or sparse models with billions or trillions of parameters
-* Achieve excellent system throughput and efficiently scale to thousands of GPUs
-* Train/Inference on resource constrained GPU systems
-* Achieve unprecedented low latency and high throughput for inference
-* Achieve extreme compression for an unparalleled inference latency and model size reduction with low costs
-
----
-
-# DeepSpeed's three innovation pillars
-
-
-
-
-## DeepSpeed-Training
-
-DeepSpeed offers a confluence of system innovations, that has made large scale DL training effective, and efficient, greatly improved ease of use, and redefined the DL training landscape in terms of scale that is possible. These innovations such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, ZeRO-Infinity, etc. fall under the training pillar. Learn more: [DeepSpeed-Training](https://www.deepspeed.ai/training/)
-
-## DeepSpeed-Inference
-
-DeepSpeed brings together innovations in parallelism technology such as tensor, pipeline, expert and ZeRO-parallelism, and combines them with high performance custom inference kernels, communication optimizations and heterogeneous memory technologies to enable inference at an unprecedented scale, while achieving unparalleled latency, throughput and cost reduction. This systematic composition of system technologies for inference falls under the inference pillar. Learn more: [DeepSpeed-Inference](https://www.deepspeed.ai/inference)
-
-
-## DeepSpeed-Compression
-
-To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression)
-
----
-
-# DeepSpeed Software Suite
-
-## DeepSpeed Library
-
- The [DeepSpeed](https://github.com/microsoft/deepspeed) library (this repository) implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)).
-
-## Model Implementations for Inference (MII)
-
- [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions.
-
-## DeepSpeed on Azure
-
- DeepSpeed users are diverse and have access to different environments. We recommend to try DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/).
-
----
-
-# DeepSpeed Adoption
-
-DeepSpeed is an important part of Microsoft’s new
-[AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/)
-initiative to enable next-generation AI capabilities at scale, where you can find more
-information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale).
-
-DeepSpeed has been used to train many different large-scale models, below is a list of several examples that we are aware of (if you'd like to include your model please submit a PR):
-
- * [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/)
- * [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)
- * [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed)
- * [GLM (130B)](https://github.com/THUDM/GLM-130B)
- * [YaLM (100B)](https://github.com/yandex/YaLM-100B)
- * [GPT-NeoX (20B)](https://github.com/EleutherAI/gpt-neox)
- * [AlexaTM (20B)](https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning)
- * [Turing NLG (17B)](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/)
- * [METRO-LM (5.4B)](https://arxiv.org/pdf/2204.06644.pdf)
-
-DeepSpeed has been integrated with several different popular open-source DL frameworks such as:
-
-| | Documentation |
-| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
- | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) |
-| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) |
-| | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) |
-| | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
-| | [Determined with DeepSpeed](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) |
-
----
-
-# Build Pipeline Status
-
-| Description | Status |
-| ----------- | ------ |
-| NVIDIA | [![nv-torch12-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml) [![nv-torch18-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) |
-| AMD | [![amd](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml) |
-| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
-| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) |
-| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)|
-
-# Installation
-
-The quickest way to get started with DeepSpeed is via pip, this will install
-the latest release of DeepSpeed which is not tied to specific PyTorch or CUDA
-versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer
-to as our 'ops'. By default, all of these extensions/ops will be built
-just-in-time (JIT) using [torch's JIT C++ extension loader that relies on
-ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and
-dynamically link them at runtime.
-
-## Requirements
-* [PyTorch](https://pytorch.org/) must be installed _before_ installing DeepSpeed.
-* For full feature support we recommend a version of PyTorch that is >= 1.8 and ideally the latest PyTorch stable release.
-* A CUDA or ROCm compiler such as [nvcc](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#introduction) or [hipcc](https://github.com/ROCm-Developer-Tools/HIPCC) used to compile C++/CUDA/HIP extensions.
-* Specific GPUs we develop and test against are listed below, this doesn't mean your GPU will not work if it doesn't fall into this category it's just DeepSpeed is most well tested on the following:
- * NVIDIA: Pascal, Volta, Ampere, and Hopper architectures
- * AMD: MI100 and MI200
-
-## PyPI
-We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases.
-
-```bash
-pip install deepspeed
-```
-
-After installation, you can validate your install and see which extensions/ops
-your machine is compatible with via the DeepSpeed environment report.
-
-```bash
-ds_report
-```
-
-If you would like to pre-install any of the DeepSpeed extensions/ops (instead
-of JIT compiling) or install pre-compiled ops via PyPI please see our [advanced
-installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/).
-
-## Windows
-Windows support is partially supported with DeepSpeed. On Windows you can build wheel with following steps, currently only inference mode is supported.
-1. Install pytorch, such as pytorch 1.8 + cuda 11.1
-2. Install visual cpp build tools, such as VS2019 C++ x64/x86 build tools
-3. Launch cmd console with Administrator privilege for creating required symlink folders
-4. Run `python setup.py bdist_wheel` to build wheel in `dist` folder
-
-# Features
-
-Please checkout [DeepSpeed-Training](https://www.deepspeed.ai/training), [DeepSpeed-Inference](https://www.deepspeed.ai/inference) and [DeepSpeed-Compression](https://www.deepspeed.ai/compression) pages for full set of features offered along each of these three pillars.
-
-# Further Reading
-
-All DeepSpeed documentation, tutorials, and blogs can be found on our website: [deepspeed.ai](https://www.deepspeed.ai/)
-
-
-| | Description |
-| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
-| [Getting Started](https://www.deepspeed.ai/getting-started/) | First steps with DeepSpeed |
-| [DeepSpeed JSON Configuration](https://www.deepspeed.ai/docs/config-json/) | Configuring DeepSpeed |
-| [API Documentation](https://deepspeed.readthedocs.io/en/latest/) | Generated DeepSpeed API documentation |
-| [Tutorials](https://www.deepspeed.ai/tutorials/) | Tutorials |
-| [Blogs](https://www.deepspeed.ai/posts/) | Blogs |
-
-
-# Contributing
-DeepSpeed welcomes your contributions! Please see our
-[contributing](CONTRIBUTING.md) guide for more details on formatting, testing,
-etc.
-
-## Contributor License Agreement
-This project welcomes contributions and suggestions. Most contributions require you to
-agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
-actually do, grant us the rights to use your contribution. For details, visit
-https://cla.opensource.microsoft.com.
-
-When you submit a pull request, a CLA bot will automatically determine whether you need
-to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
-follow the instructions provided by the bot. You will only need to do this once across
-all repos using our CLA.
-
-## Code of Conduct
-This project has adopted the [Microsoft Open Source Code of
-Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the
-[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
-[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
-
-# Publications
-1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727).
-2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703).
-3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html).
-4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie).
-5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html).
-6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205).
-7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/).
-8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh).
-9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009).
-10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html).
-11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990).
-12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl).
-13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1).
-14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946).
-15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586).
-16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597).
-
-
-# Videos
-1. DeepSpeed KDD 2020 Tutorial
- 1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29)
- 2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28)
- 3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27)
- 4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26)
- 5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93)
- 6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24)
-2. Microsoft Research Webinar
- * Registration is free and all videos are available on-demand.
- * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html).
-3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8)
-4. Community Tutorials
- * [DeepSpeed: All the tricks to scale to gigantic models (Mark Saroufim)](https://www.youtube.com/watch?v=pDGI668pNg0)
- * [Turing-NLG, DeepSpeed and the ZeRO optimizer (Yannic Kilcher)](https://www.youtube.com/watch?v=tC01FRB0M7w)
- * [Ultimate Guide To Scaling ML Models (The AI Epiphany)](https://www.youtube.com/watch?v=hc0u4avAkuM)
+- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on.
+- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward.
diff --git a/SECURITY.md b/SECURITY.md
index e0dfff56a956..9e9391ee0bac 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -12,7 +12,7 @@ If you believe you have found a security vulnerability in any Microsoft-owned re
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
-If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
diff --git a/accelerator/__init__.py b/accelerator/__init__.py
index d29c3b15467b..efed1ef84aca 100644
--- a/accelerator/__init__.py
+++ b/accelerator/__init__.py
@@ -1,4 +1,7 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
from .abstract_accelerator import DeepSpeedAccelerator
-from .real_accelerator import get_accelerator, set_accelerator
+from .real_accelerator import get_accelerator, set_accelerator, is_current_accelerator_supported
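A short usage sketch of the public API exported here, assuming DeepSpeed is installed; `is_current_accelerator_supported` is the newly exported helper, and the printed values below are examples, not guarantees:

```python
# Minimal usage sketch of the accelerator package exports added above.
from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported

acc = get_accelerator()                    # accelerator object for the detected backend
print(acc.device_name())                   # e.g. 'cuda', 'cpu', 'mps', or 'npu'
print(acc.communication_backend_name())    # e.g. 'nccl', 'ccl', or 'hccl'
print(is_current_accelerator_supported())  # whether the detected backend is supported
```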
diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
index 03dfbe9df43d..0c7ee3212fad 100644
--- a/accelerator/abstract_accelerator.py
+++ b/accelerator/abstract_accelerator.py
@@ -1,14 +1,22 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
import abc
from abc import ABC
class DeepSpeedAccelerator(ABC):
+
def __init__(self):
self._name = None
self._communication_backend_name = None
+ @abc.abstractmethod
+ def is_synchronized_device(self):
+ ...
+
# Device APIs
@abc.abstractmethod
def device_name(self, device_index):
@@ -139,6 +147,10 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
...
+ @abc.abstractmethod
+ def available_memory(self, device_index=None):
+ ...
+
# Data types
@abc.abstractmethod
def is_bf16_supported(self):
@@ -148,6 +160,10 @@ def is_bf16_supported(self):
def is_fp16_supported(self):
...
+ @abc.abstractmethod
+ def supported_dtypes(self):
+ ...
+
# Misc
@abc.abstractmethod
def amp(self):
@@ -173,6 +189,10 @@ def lazy_call(self, callback):
def communication_backend_name(self):
...
+ @abc.abstractmethod
+ def is_triton_supported(self):
+ ...
+
# Tensor operations
@property
@abc.abstractmethod
@@ -210,7 +230,11 @@ def LongTensor(self):
...
@abc.abstractmethod
- def pin_memory(self, tensor):
+ def pin_memory(self, tensor, align_bytes=1):
+ ...
+
+ @abc.abstractmethod
+ def is_pinned(self, tensor):
...
@abc.abstractmethod
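Because the new methods are declared with `@abc.abstractmethod`, every concrete accelerator must now implement them before it can be instantiated. A schematic (a toy stand-in, not a real backend) showing just two of the additions:

```python
import abc
from abc import ABC


class AcceleratorSketch(ABC):
    """Trimmed stand-in for the real interface, showing only new methods."""

    @abc.abstractmethod
    def is_synchronized_device(self):
        ...

    @abc.abstractmethod
    def available_memory(self, device_index=None):
        ...


class ToyAccelerator(AcceleratorSketch):

    def is_synchronized_device(self):
        return True  # a synchronous device needs no stream/event handling

    def available_memory(self, device_index=None):
        return 0  # illustrative placeholder


ToyAccelerator()  # succeeds; omitting either method would raise TypeError
```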
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py
new file mode 100644
index 000000000000..fdbbd33c07a2
--- /dev/null
+++ b/accelerator/cpu_accelerator.py
@@ -0,0 +1,282 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
+import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore
+import psutil
+import os
+
+
+# accelerator for Intel CPU
+class CPU_Accelerator(DeepSpeedAccelerator):
+
+ def __init__(self):
+ self._name = 'cpu'
+ self._communication_backend_name = 'ccl'
+ self.max_mem = psutil.Process().memory_info().rss
+
+ def is_synchronized_device(self):
+ return True
+
+ # Device APIs
+ def device_name(self, device_index=None):
+ return 'cpu'
+
+ def device(self, device_index=None):
+ return None
+
+ def set_device(self, device_index):
+ return
+
+ def current_device(self):
+        return int(os.environ.get('LOCAL_RANK', '0'))  # env vars are strings; normalize to int
+
+ def current_device_name(self):
+ return 'cpu'
+
+ def device_count(self):
+ device_count = int(os.environ.get('LOCAL_SIZE', 0))
+ if device_count > 0:
+ return device_count
+ else:
+ from deepspeed.utils.numa import get_numa_cores
+            # Count NUMA nodes to get the number of CPU accelerators. On machines
+            # with HBM in flat mode, HBM sits in a separate NUMA node with no cores;
+            # ignore those core-less NUMA nodes.
+ numa_core_lists = get_numa_cores()
+ numa_count = 0
+ prev_core_list = []
+ for core_list in numa_core_lists:
+ if len(core_list) > 0 and core_list != prev_core_list:
+ numa_count += 1
+ prev_core_list = core_list
+ return numa_count
+
+ def synchronize(self, device_index=None):
+ return
+
+ # RNG APIs
+ def random(self):
+ return torch.random
+
+ def set_rng_state(self, new_state, device_index=None):
+        if device_index is None:
+ return torch.set_rng_state(new_state)
+ return torch.set_rng_state(new_state, device_index)
+
+ def get_rng_state(self, device_index=None):
+ return torch.get_rng_state()
+
+ def manual_seed(self, seed):
+ return torch.manual_seed(seed)
+
+ def manual_seed_all(self, seed):
+ return torch.manual_seed(seed)
+
+ def initial_seed(self, seed):
+ return torch.initial_seed(seed)
+
+ def default_generator(self, device_index):
+ return torch.default_generator
+
+ # Streams/Events
+ @property
+ def Stream(self):
+ return None
+
+ def stream(self, stream):
+ from deepspeed.runtime.utils import noop_context
+ return noop_context()
+
+ def current_stream(self, device_index=None):
+ return None
+
+ def default_stream(self, device_index=None):
+ return None
+
+ @property
+ def Event(self):
+ return None
+
+ # Memory management
+ def empty_cache(self):
+ return
+
+ def get_rss(self):
+ mem = psutil.Process().memory_info().rss
+ if mem > self.max_mem:
+ self.max_mem = mem
+ return mem
+
+ def reset_rss(self):
+ mem = psutil.Process().memory_info().rss
+ self.max_mem = mem
+ return mem
+
+ def memory_allocated(self, device_index=None):
+ return self.get_rss()
+
+ def max_memory_allocated(self, device_index=None):
+ self.get_rss()
+ return self.max_mem
+
+ def reset_max_memory_allocated(self, device_index=None):
+ self.reset_rss()
+ return
+
+ def memory_cached(self, device_index=None):
+ return self.get_rss()
+
+ def max_memory_cached(self, device_index=None):
+ self.get_rss()
+ return self.max_mem
+
+ def reset_max_memory_cached(self, device_index=None):
+ self.reset_rss()
+ return
+
+ def memory_stats(self, device_index=None):
+ mem = self.get_rss()
+ mem_stat = {}
+ mem_stat['allocated_bytes.all.current'] = mem
+ mem_stat['allocated_bytes.all.peak'] = self.max_mem
+ return mem_stat
+
+ def reset_peak_memory_stats(self, device_index=None):
+ self.reset_rss()
+ return
+
+ def memory_reserved(self, device_index=None):
+ return self.get_rss()
+
+ def max_memory_reserved(self, device_index=None):
+ self.get_rss()
+ return self.max_mem
+
+ def total_memory(self, device_index=None):
+ return psutil.virtual_memory().total
+
+ def available_memory(self, device_index=None):
+ return psutil.virtual_memory().available
+
+ # Misc
+ def amp(self):
+ return torch.cpu.amp
+
+ def is_available(self):
+ return True
+
+ def range_push(self, msg):
+ # TODO itt is currently not supported yet
+ # return torch.profiler.itt.range_push(msg)
+ return
+
+ def range_pop(self):
+ # TODO itt is currently not supported yet
+ # return torch.profiler.itt.range_pop()
+ return
+
+ def lazy_call(self, callback):
+ return callback()
+
+ def communication_backend_name(self):
+ return self._communication_backend_name
+
+ def is_triton_supported(self):
+ return False
+
+ # Data types
+ def is_bf16_supported(self):
+ return True
+
+ def is_fp16_supported(self):
+ return False
+
+ def supported_dtypes(self):
+ return [torch.float, torch.bfloat16]
+
+ # Tensor operations
+
+ @property
+ def BFloat16Tensor(self):
+ return torch.BFloat16Tensor
+
+ @property
+ def ByteTensor(self):
+ return torch.ByteTensor
+
+ @property
+ def DoubleTensor(self):
+ return torch.DoubleTensor
+
+ @property
+ def FloatTensor(self):
+ return torch.FloatTensor
+
+ @property
+ def HalfTensor(self):
+ return torch.HalfTensor
+
+ @property
+ def IntTensor(self):
+ return torch.IntTensor
+
+ @property
+ def LongTensor(self):
+ return torch.LongTensor
+
+ def pin_memory(self, tensor, align_bytes=1):
+ return tensor
+
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
+ def op_builder_dir(self):
+ try:
+ # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+ # if successful this also means we're doing a local install and not JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+ return "op_builder.cpu"
+ except ImportError:
+ return "deepspeed.ops.op_builder.cpu"
+
+ def on_accelerator(self, tensor):
+ device_str = str(tensor.device)
+ if device_str.startswith('cpu'):
+ return True
+ else:
+ return False
+
+ # create an instance of op builder and return, name specified by class_name
+ def create_op_builder(self, op_name):
+ builder_class = self.get_op_builder(op_name)
+        if builder_class is not None:
+ return builder_class()
+ return None
+
+ # return an op builder class, name specified by class_name
+ def get_op_builder(self, class_name):
+ try:
+ # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+ # if successful this also means we're doing a local install and not JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+ from op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder
+ except ImportError:
+ from deepspeed.ops.op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder
+
+ if class_name == "CCLCommBuilder":
+ return CCLCommBuilder
+ elif class_name == "FusedAdamBuilder":
+ return FusedAdamBuilder
+ elif class_name == "CPUAdamBuilder":
+ return CPUAdamBuilder
+ else:
+            # return NotImplementedBuilder instead of None to avoid 'NoneType' attribute errors in unit tests
+ return NotImplementedBuilder
+
+ def build_extension(self):
+ from torch.utils.cpp_extension import BuildExtension
+ return BuildExtension
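The CPU backend has no device allocator, so it emulates CUDA's peak-memory counters with a process-RSS high-water mark via psutil, as in `get_rss`/`reset_rss` above. The pattern in isolation, as a minimal self-contained sketch:

```python
# Minimal sketch of the RSS high-water-mark pattern used by CPU_Accelerator.
import psutil


class RssHighWaterMark:
    """Track current and peak resident set size of this process."""

    def __init__(self):
        self.max_mem = psutil.Process().memory_info().rss

    def sample(self):
        mem = psutil.Process().memory_info().rss
        self.max_mem = max(self.max_mem, mem)
        return mem  # analogous to memory_allocated()

    def peak(self):
        self.sample()
        return self.max_mem  # analogous to max_memory_allocated()
```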
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index 945ba42a3a9c..2786b425ca7f 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -1,4 +1,7 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
import os
import pkgutil
@@ -12,30 +15,32 @@
except ImportError:
pass
+# Delay import pynvml to avoid import error when CUDA is not available
+pynvml = None
+
class CUDA_Accelerator(DeepSpeedAccelerator):
+
def __init__(self):
self._name = 'cuda'
self._communication_backend_name = 'nccl'
+ if pynvml is None:
+ self._init_pynvml()
- # begin initialize for create_op_builder()
- # put all valid class name <--> class type mapping into class_dict
- op_builder_dir = self.op_builder_dir()
- op_builder_module = importlib.import_module(op_builder_dir)
-
- for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
- # avoid self references
- if module_name != 'all_ops' and module_name != 'builder':
- module = importlib.import_module("{}.{}".format(
- op_builder_dir,
- module_name))
- for member_name in module.__dir__():
- if member_name.endswith(
- 'Builder'
- ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes
- if not member_name in self.class_dict:
- self.class_dict[member_name] = getattr(module, member_name)
- # end initialize for create_op_builder()
+ def _init_pynvml(self):
+ global pynvml
+ try:
+ import pynvml
+ except ImportError:
+ return
+ try:
+ pynvml.nvmlInit()
+ except pynvml.NVMLError:
+ pynvml = None
+ return
+
+ def is_synchronized_device(self):
+ return False
# Device APIs
def device_name(self, device_index=None):
@@ -148,6 +153,31 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
return torch.cuda.get_device_properties(device_index).total_memory
+ def _get_nvml_gpu_id(self, torch_gpu_id):
+ """
+ credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020
+
+ Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES.
+
+ If the latter isn't set return the same id
+ """
+ # if CUDA_VISIBLE_DEVICES is used automagically remap the id since pynvml ignores this env var
+ if "CUDA_VISIBLE_DEVICES" in os.environ:
+ ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
+ return ids[torch_gpu_id] # remap
+ else:
+ return torch_gpu_id
+
+ def available_memory(self, device_index=None):
+ if pynvml:
+ if device_index is None:
+ device_index = self.current_device()
+ handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index))
+ info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+ return info.free
+ else:
+ return self.total_memory(device_index) - self.memory_allocated(device_index)
+
# Data types
def is_bf16_supported(self):
return torch.cuda.is_bf16_supported()
@@ -159,6 +189,9 @@ def is_fp16_supported(self):
else:
return False
+ def supported_dtypes(self):
+ return [torch.float, torch.half, torch.bfloat16]
+
# Misc
def amp(self):
if hasattr(torch.cuda, 'amp'):
@@ -182,6 +215,13 @@ def lazy_call(self, callback):
def communication_backend_name(self):
return self._communication_backend_name
+ def is_triton_supported(self):
+ major, _ = torch.cuda.get_device_capability()
+ if major >= 8:
+ return True
+ else:
+ return False
+
# Tensor operations
@property
@@ -212,9 +252,12 @@ def IntTensor(self):
def LongTensor(self):
return torch.cuda.LongTensor
- def pin_memory(self, tensor):
+ def pin_memory(self, tensor, align_bytes=1):
return tensor.pin_memory()
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
def on_accelerator(self, tensor):
device_str = str(tensor.device)
if device_str.startswith('cuda:'):
@@ -226,7 +269,7 @@ def op_builder_dir(self):
try:
# is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
# if successful this also means we're doing a local install and not JIT compile path
- from op_builder import __deepspeed__ # noqa: F401
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
return "op_builder"
except ImportError:
return "deepspeed.ops.op_builder"
@@ -234,10 +277,35 @@ def op_builder_dir(self):
# dict that holds class name <--> class type mapping i.e.
# 'AsyncIOBuilder':
# this dict will be filled at init stage
- class_dict = {}
+ class_dict = None
+
+ def _lazy_init_class_dict(self):
+        if self.class_dict is not None:
+ return
+ else:
+ self.class_dict = {}
+ # begin initialize for create_op_builder()
+ # put all valid class name <--> class type mapping into class_dict
+ op_builder_dir = self.op_builder_dir()
+ op_builder_module = importlib.import_module(op_builder_dir)
+ op_builder_absolute_path = os.path.dirname(op_builder_module.__file__)
+ for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]):
+                # avoid self references; skip subdirectories that contain
+                # ops for other backends (cpu, npu, etc.).
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir(
+ os.path.join(op_builder_absolute_path, module_name)):
+ module = importlib.import_module("{}.{}".format(op_builder_dir, module_name))
+ for member_name in module.__dir__():
+ if member_name.endswith(
+ 'Builder'
+ ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes
+                        if member_name not in self.class_dict:
+ self.class_dict[member_name] = getattr(module, member_name)
+ # end initialize for create_op_builder()
# create an instance of op builder and return, name specified by class_name
def create_op_builder(self, class_name):
+ self._lazy_init_class_dict()
if class_name in self.class_dict:
return self.class_dict[class_name]()
else:
@@ -245,6 +313,7 @@ def create_op_builder(self, class_name):
# return an op builder class, name specified by class_name
def get_op_builder(self, class_name):
+ self._lazy_init_class_dict()
if class_name in self.class_dict:
return self.class_dict[class_name]
else:
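`available_memory` above prefers an NVML query because torch's allocator statistics do not account for memory held by other processes. A standalone sketch of the same query, including the `CUDA_VISIBLE_DEVICES` remapping, assuming pynvml is installed and NVML initializes on this machine:

```python
# Standalone sketch of the pynvml free-memory query used above.
import os
import pynvml

pynvml.nvmlInit()

torch_gpu_id = 0  # the torch device index we want to inspect
if "CUDA_VISIBLE_DEVICES" in os.environ:
    # pynvml ignores CUDA_VISIBLE_DEVICES, so remap torch's logical index
    # to the physical NVML index
    ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
    nvml_gpu_id = ids[torch_gpu_id]
else:
    nvml_gpu_id = torch_gpu_id

handle = pynvml.nvmlDeviceGetHandleByIndex(nvml_gpu_id)
print(pynvml.nvmlDeviceGetMemoryInfo(handle).free)  # bytes free on the device
```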
diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py
new file mode 100644
index 000000000000..77595f6b636c
--- /dev/null
+++ b/accelerator/mps_accelerator.py
@@ -0,0 +1,237 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+
+from .abstract_accelerator import DeepSpeedAccelerator
+
+# During the setup stage torch may not be installed yet; passing on a missing
+# torch still allows the op-builder-related APIs to be executed.
+try:
+ import torch.mps
+except ImportError:
+ pass
+
+
+class MPS_Accelerator(DeepSpeedAccelerator):
+
+ def __init__(self):
+ self._name = "mps"
+ self._communication_backend_name = None
+
+ def is_synchronized_device(self):
+ return False
+
+ # Device APIs
+ def device_name(self, device_index=None):
+        if device_index is None:
+ return "mps"
+ return "mps:{}".format(device_index)
+
+ def device(self, device_index):
+ return torch.device("mps", index=0)
+
+ def set_device(self, device_index):
+ return
+
+ def current_device(self):
+ return torch.device("mps", index=0)
+
+ def current_device_name(self):
+ return "mps:0"
+
+ def device_count(self):
+ return 1
+
+ def synchronize(self, device_index=None):
+ return torch.mps.synchronize()
+
+ # RNG APIs
+ def random(self):
+ return torch.random
+
+ def set_rng_state(self, new_state, device_index=None):
+ return torch.mps.set_rng_state(new_state)
+
+ def get_rng_state(self, device_index=None):
+ return torch.mps.get_rng_state()
+
+ def manual_seed(self, seed):
+ return torch.mps.manual_seed(seed)
+
+ def manual_seed_all(self, seed):
+ return torch.mps.manual_seed(seed)
+
+ def seed(self):
+ return torch.mps.seed()
+
+ def initial_seed(self, seed):
+ return
+
+ def default_generator(self, device_index):
+ return
+
+ # Streams/Events
+ @property
+ def Stream(self):
+ return None
+
+ def stream(self, stream):
+ return None
+
+ def current_stream(self, device_index=None):
+ return None
+
+ def default_stream(self, device_index=None):
+ return None
+
+ @property
+ def Event(self):
+ return None
+
+ # Memory management
+ def empty_cache(self):
+ return torch.mps.empty_cache()
+
+ def memory_allocated(self, device_index=None):
+ return torch.mps.current_allocated_memory()
+
+ def max_memory_allocated(self, device_index=None):
+ return torch.mps.driver_allocated_memory()
+
+ def set_per_process_memory_fraction(self, fraction):
+ return torch.mps.set_per_process_memory_fraction(fraction)
+
+ def reset_max_memory_allocated(self, device_index=None):
+ return
+
+ def memory_cached(self, device_index=None):
+ return
+
+ def max_memory_cached(self, device_index=None):
+ return
+
+ def reset_max_memory_cached(self, device_index=None):
+ return
+
+ def memory_stats(self, device_index=None):
+ return
+
+ def reset_peak_memory_stats(self, device_index=None):
+ return
+
+ def memory_reserved(self, device_index=None):
+ return
+
+ def max_memory_reserved(self, device_index=None):
+ return
+
+ def total_memory(self, device_index=None):
+ return
+
+ def available_memory(self, device_index=None):
+ return
+
+ # Data types
+ def is_bf16_supported(self):
+ return False
+
+ def is_fp16_supported(self):
+ return False
+
+ def supported_dtypes(self):
+ return [torch.float]
+
+ # Misc
+ def amp(self):
+ return
+
+ def is_available(self):
+ return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+
+ def range_push(self, msg):
+ return
+
+ def range_pop(self):
+ return
+
+ def lazy_call(self, callback):
+ return
+
+ def communication_backend_name(self):
+ return self._communication_backend_name
+
+ def is_triton_supported(self):
+ return False
+
+ # Tensor operations
+ @property
+ def BFloat16Tensor(self):
+ return
+
+ @property
+ def ByteTensor(self):
+ return
+
+ @property
+ def DoubleTensor(self):
+ return
+
+ @property
+ def FloatTensor(self):
+ return
+
+ @property
+ def HalfTensor(self):
+ return
+
+ @property
+ def IntTensor(self):
+ return
+
+ @property
+ def LongTensor(self):
+ return
+
+ def pin_memory(self, tensor, align_bytes=1):
+ return tensor.pin_memory()
+
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
+ def on_accelerator(self, tensor):
+ device_str = str(tensor.device)
+ if device_str.startswith("mps"):
+ return True
+ else:
+ return False
+
+ def op_builder_dir(self):
+ try:
+ # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+ # if successful this also means we're doing a local install and not JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+
+ return "op_builder"
+ except ImportError:
+ return "deepspeed.ops.op_builder"
+
+ # create an instance of op builder, specified by class_name
+ def create_op_builder(self, op_name):
+ builder_class = self.get_op_builder(op_name)
+        if builder_class is not None:
+ return builder_class()
+ return None
+
+ # return an op builder class, specified by class_name
+ def get_op_builder(self, class_name):
+ from deepspeed.ops.op_builder.cpu import NotImplementedBuilder
+
+ return NotImplementedBuilder
+
+ def build_extension(self):
+ from torch.utils.cpp_extension import BuildExtension
+
+ return BuildExtension
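A quick way to exercise the MPS backend path shown above, assuming a macOS PyTorch build (2.0+) with MPS support compiled in:

```python
# Minimal check mirroring MPS_Accelerator.is_available() and the memory API.
import torch

if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    x = torch.ones(4, device="mps")  # allocate on the Apple GPU
    torch.mps.synchronize()          # MPS is asynchronous, like CUDA
    print(torch.mps.current_allocated_memory())
else:
    print("MPS backend not available")
```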
diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py
new file mode 100644
index 000000000000..7ebbd320bb15
--- /dev/null
+++ b/accelerator/npu_accelerator.py
@@ -0,0 +1,257 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+import importlib
+import inspect
+
+from .abstract_accelerator import DeepSpeedAccelerator
+# During the setup stage torch may not be installed yet; passing on a missing
+# torch still allows the op-builder-related APIs to be executed.
+try:
+ import torch.npu
+except ImportError:
+ pass
+
+
+class NPU_Accelerator(DeepSpeedAccelerator):
+
+ def __init__(self):
+ super().__init__()
+ self._name = 'npu'
+ self._communication_backend_name = 'hccl'
+        # dict that holds class name <--> class type mapping, e.g.
+        # 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
+        # filled lazily by _lazy_init_class_dict() on first op-builder lookup
+        self.class_dict = None
+
+ def is_synchronized_device(self):
+ return False
+
+ # Device APIs
+ def device_name(self, device_index=None):
+        if device_index is None:
+ return 'npu'
+ return 'npu:{}'.format(device_index)
+
+ def device(self, device_index=None):
+ return torch.npu.device(device_index)
+
+ def set_device(self, device_index):
+ torch.npu.set_device(device_index)
+
+ def current_device(self):
+ return torch.npu.current_device()
+
+ def current_device_name(self):
+ return 'npu:{}'.format(torch.npu.current_device())
+
+ def device_count(self):
+ return torch.npu.device_count()
+
+ def synchronize(self, device_index=None):
+ return torch.npu.synchronize(device_index)
+
+ # RNG APIs
+ def random(self):
+ return torch.random
+
+ def set_rng_state(self, new_state, device_index=None):
+ if device_index is None:
+ return torch.npu.set_rng_state(new_state)
+
+ return torch.npu.set_rng_state(new_state, device_index)
+
+ def get_rng_state(self, device_index=None):
+ if device_index is None:
+ return torch.npu.get_rng_state()
+
+ return torch.npu.get_rng_state(device_index)
+
+ def manual_seed(self, seed):
+ return torch.npu.manual_seed(seed)
+
+ def manual_seed_all(self, seed):
+ return torch.npu.manual_seed_all(seed)
+
+ def initial_seed(self, seed):
+ return torch.npu.initial_seed(seed)
+
+ def default_generator(self, device_index):
+ return torch.npu.default_generators[device_index]
+
+ # Streams/Events
+ @property
+ def Stream(self):
+ return torch.npu.Stream
+
+ def stream(self, stream):
+ return torch.npu.stream(stream)
+
+ def current_stream(self, device_index=None):
+ return torch.npu.current_stream(device_index)
+
+ def default_stream(self, device_index=None):
+ return torch.npu.default_stream(device_index)
+
+ @property
+ def Event(self):
+ return torch.npu.Event
+
+ # Memory management
+ def empty_cache(self):
+ return torch.npu.empty_cache()
+
+ def memory_allocated(self, device_index=None):
+ return torch.npu.memory_allocated(device_index)
+
+ def max_memory_allocated(self, device_index=None):
+ return torch.npu.max_memory_allocated(device_index)
+
+ def reset_max_memory_allocated(self, device_index=None):
+ return torch.npu.reset_max_memory_allocated(device_index)
+
+ def memory_cached(self, device_index=None):
+ return torch.npu.memory_cached(device_index)
+
+ def max_memory_cached(self, device_index=None):
+ return torch.npu.max_memory_cached(device_index)
+
+ def reset_max_memory_cached(self, device_index=None):
+ return torch.npu.reset_max_memory_cached(device_index)
+
+ def memory_stats(self, device_index=None):
+ if hasattr(torch.npu, 'memory_stats'):
+ return torch.npu.memory_stats(device_index)
+
+ def reset_peak_memory_stats(self, device_index=None):
+ if hasattr(torch.npu, 'reset_peak_memory_stats'):
+ return torch.npu.reset_peak_memory_stats(device_index)
+
+ def memory_reserved(self, device_index=None):
+ if hasattr(torch.npu, 'memory_reserved'):
+ return torch.npu.memory_reserved(device_index)
+
+ def max_memory_reserved(self, device_index=None):
+ if hasattr(torch.npu, 'max_memory_reserved'):
+ return torch.npu.max_memory_reserved(device_index)
+
+ def total_memory(self, device_index=None):
+ return torch.npu.get_device_properties(device_index).total_memory
+
+ def available_memory(self, device_index=None):
+ return self.total_memory(device_index) - self.memory_allocated(device_index)
+
+ # Data types
+ def is_bf16_supported(self):
+ return torch.npu.is_bf16_supported()
+
+ def is_fp16_supported(self):
+ return True
+
+ def supported_dtypes(self):
+ return [torch.float, torch.half, torch.bfloat16]
+
+ # Misc
+ def amp(self):
+ if hasattr(torch.npu, 'amp'):
+ return torch.npu.amp
+ return None
+
+ def is_available(self):
+ return torch.npu.is_available()
+
+ def range_push(self, msg):
+ return
+
+ def range_pop(self):
+ return
+
+ def lazy_call(self, callback):
+ return torch.npu._lazy_call(callback)
+
+ def communication_backend_name(self):
+ return self._communication_backend_name
+
+ def is_triton_supported(self):
+ return False
+
+ # Tensor operations
+
+ @property
+ def BFloat16Tensor(self):
+ return torch.npu.BFloat16Tensor
+
+ @property
+ def ByteTensor(self):
+ return torch.npu.ByteTensor
+
+ @property
+ def DoubleTensor(self):
+ return torch.npu.DoubleTensor
+
+ @property
+ def FloatTensor(self):
+ return torch.npu.FloatTensor
+
+ @property
+ def HalfTensor(self):
+ return torch.npu.HalfTensor
+
+ @property
+ def IntTensor(self):
+ return torch.npu.IntTensor
+
+ @property
+ def LongTensor(self):
+ return torch.npu.LongTensor
+
+ def pin_memory(self, tensor, align_bytes=1):
+ return tensor.pin_memory()
+
+ def is_pinned(self, tensor):
+ return tensor.is_pinned()
+
+    def on_accelerator(self, tensor):
+        device_str = str(tensor.device)
+        return device_str.startswith('npu:')
+
+ def op_builder_dir(self):
+ try:
+            # is op_builder from deepspeed or a third-party version? this should only succeed if it's deepspeed
+            # if successful this also means we're doing a local install and not the JIT compile path
+ from op_builder import __deepspeed__ # noqa: F401 # type: ignore
+ return "op_builder.npu"
+ except ImportError:
+ return "deepspeed.ops.op_builder.npu"
+
+ def _lazy_init_class_dict(self):
+ if self.class_dict:
+ return
+
+ op_builder_module = importlib.import_module(self.op_builder_dir())
+
+ # get op builder class from op_builder/npu/__init__.py
+ self.class_dict = {}
+ for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass):
+ self.class_dict[class_name] = class_obj
+
+    # create and return an instance of the op builder named by class_name
+ def create_op_builder(self, class_name):
+ builder_class = self.get_op_builder(class_name)
+ return None if builder_class is None else builder_class()
+
+ # return an op builder class, name specified by class_name
+ def get_op_builder(self, class_name):
+ self._lazy_init_class_dict()
+ if class_name in self.class_dict:
+ return self.class_dict[class_name]
+ else:
+ return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None
+
+ def build_extension(self):
+ from torch.utils.cpp_extension import BuildExtension
+ return BuildExtension
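
The class-dict machinery in NPU_Accelerator above is a small lazy registry: the op-builder module is imported once, on first lookup, every class it exposes is indexed by name, and unknown names fall back to NotImplementedBuilder. A standalone sketch of the same pattern (the LazyBuilderRegistry name is illustrative, not a DeepSpeed API):

-----------[code] lazy_registry.py -----------
import importlib
import inspect


class LazyBuilderRegistry:

    def __init__(self, module_name):
        self._module_name = module_name
        self._class_dict = None  # filled on first lookup, not at construction

    def _lazy_init(self):
        if self._class_dict is not None:
            return
        module = importlib.import_module(self._module_name)
        # index every class the module exposes by its name
        self._class_dict = dict(inspect.getmembers(module, inspect.isclass))

    def get_op_builder(self, class_name):
        self._lazy_init()
        # unknown names fall back to NotImplementedBuilder when available
        return self._class_dict.get(class_name, self._class_dict.get('NotImplementedBuilder'))
-----------[code] lazy_registry.py -----------

Deferring the import keeps NPU_Accelerator() constructible even when the op-builder package is not importable yet, matching the setup-stage concern noted at the top of the file.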
diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py
index 06cbb0b08e7a..49133489b051 100644
--- a/accelerator/real_accelerator.py
+++ b/accelerator/real_accelerator.py
@@ -1,4 +1,15 @@
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+import os
+
+try:
+ # Importing logger currently requires that torch is installed, hence the try...except
+ # TODO: Remove logger dependency on torch.
+ from deepspeed.utils import logger as accel_logger
+except ImportError as e:
+ accel_logger = None
try:
from accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa1
@@ -9,6 +20,8 @@
except ImportError as e:
dsa2 = None
+SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'npu', 'mps']
+
ds_accelerator = None
@@ -23,51 +36,148 @@ def _validate_accelerator(accel_obj):
# accelerator.abstractor_accelerator
# or deepspeed.accelerator.abstract_accelerator, consider accel_obj
# is a conforming object
- if not ((dsa1 != None and isinstance(accel_obj,
- dsa1)) or
- (dsa2 != None and isinstance(accel_obj,
- dsa2))):
- raise AssertionError(
- f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator'
- )
+ if not ((dsa1 is not None and isinstance(accel_obj, dsa1)) or (dsa2 is not None and isinstance(accel_obj, dsa2))):
+ raise AssertionError(f"{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator")
# TODO: turn off is_available test since this breaks tests
- #assert accel_obj.is_available(), \
+ # assert accel_obj.is_available(), \
# f'{accel_obj.__class__.__name__} accelerator fails is_available() test'
+def is_current_accelerator_supported():
+    return get_accelerator()._name in SUPPORTED_ACCELERATOR_LIST
+
+
def get_accelerator():
global ds_accelerator
- if ds_accelerator is None:
+ if ds_accelerator is not None:
+ return ds_accelerator
+
+ accelerator_name = None
+ ds_set_method = None
+    # 1. Detect whether the accelerator is overridden via the DS_ACCELERATOR environment variable.
+    if "DS_ACCELERATOR" in os.environ:
+ accelerator_name = os.environ["DS_ACCELERATOR"]
+ if accelerator_name == "xpu":
+ try:
+ from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401 # type: ignore
+ except ImportError as e:
+ raise ValueError(
+ f"XPU_Accelerator requires intel_extension_for_deepspeed, which is not installed on this system.")
+ elif accelerator_name == "cpu":
+ try:
+ import intel_extension_for_pytorch # noqa: F401 # type: ignore
+ except ImportError as e:
+ raise ValueError(
+ f"CPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
+ elif accelerator_name == "npu":
+ try:
+ import torch_npu # noqa: F401 # type: ignore
+ except ImportError as e:
+ raise ValueError(f"NPU_Accelerator requires torch_npu, which is not installed on this system.")
+ pass
+ elif accelerator_name == "mps":
+ try:
+ import torch.mps
+
+            # torch.mps does not expose is_available() yet; use a memory query as a proxy
+ torch.mps.current_allocated_memory()
+ except (RuntimeError, ImportError) as e:
+ raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.")
+    elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST:
+        raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. '
+                         f'Value "{accelerator_name}" is not supported')
+ ds_set_method = "override"
+
+ # 2. If no override, detect which accelerator to use automatically
+ if accelerator_name is None:
+        # We need a way to choose among different accelerator types.
+        # Currently we detect which accelerator extension is installed
+        # in the environment and use it if one is found.
+        # An alternative would be to detect whether a CUDA device is
+        # present on the system, but that comes with two pitfalls:
+        # 1. the system may not have torch pre-installed, so
+        #    get_accelerator().is_available() may not work.
+        # 2. Some scenarios, like installing on a login node (without a
+        #    CUDA device) and running on a compute node (with one), can
+        #    cause a mismatch between installation time and runtime.
+
try:
- from intel_extension_for_deepspeed import XPU_Accelerator
+ from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401,F811 # type: ignore
+
+ accelerator_name = "xpu"
except ImportError as e:
pass
- else:
- ds_accelerator = XPU_Accelerator()
- _validate_accelerator(ds_accelerator)
- return ds_accelerator
-
+ if accelerator_name is None:
+ try:
+ import intel_extension_for_pytorch # noqa: F401,F811 # type: ignore
+
+ accelerator_name = "cpu"
+ except ImportError as e:
+ pass
+ if accelerator_name is None:
+ try:
+ import torch_npu # noqa: F401,F811 # type: ignore
+
+ accelerator_name = "npu"
+ except ImportError as e:
+ pass
+ if accelerator_name is None:
+ try:
+ import torch.mps
+
+            # torch.mps does not expose is_available() yet; use a memory query as a proxy
+ torch.mps.current_allocated_memory()
+ accelerator_name = "mps"
+ except (RuntimeError, ImportError) as e:
+ pass
+ if accelerator_name is None:
+ accelerator_name = "cuda"
+
+ ds_set_method = "auto detect"
+
+ # 3. Set ds_accelerator accordingly
+ if accelerator_name == "cuda":
from .cuda_accelerator import CUDA_Accelerator
+
ds_accelerator = CUDA_Accelerator()
- _validate_accelerator(ds_accelerator)
+ elif accelerator_name == "cpu":
+ from .cpu_accelerator import CPU_Accelerator
+
+ ds_accelerator = CPU_Accelerator()
+ elif accelerator_name == "xpu":
+ # XPU_Accelerator is already imported in detection stage
+ ds_accelerator = XPU_Accelerator()
+ elif accelerator_name == "npu":
+ from .npu_accelerator import NPU_Accelerator
+
+ ds_accelerator = NPU_Accelerator()
+ elif accelerator_name == "mps":
+ from .mps_accelerator import MPS_Accelerator
+
+ ds_accelerator = MPS_Accelerator()
+ _validate_accelerator(ds_accelerator)
+ if accel_logger is not None:
+ accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})")
return ds_accelerator
def set_accelerator(accel_obj):
global ds_accelerator
_validate_accelerator(accel_obj)
+ if accel_logger is not None:
+ accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
ds_accelerator = accel_obj
-'''
+"""
-----------[code] test_get.py -----------
from deepspeed.accelerator import get_accelerator
+from deepspeed.utils import logger
my_accelerator = get_accelerator()
-print(f'{my_accelerator._name=}')
-print(f'{my_accelerator._communication_backend=}')
-print(f'{my_accelerator.HalfTensor().device=}')
-print(f'{my_accelerator.total_memory()=}')
+logger.info(f'{my_accelerator._name=}')
+logger.info(f'{my_accelerator._communication_backend=}')
+logger.info(f'{my_accelerator.HalfTensor().device=}')
+logger.info(f'{my_accelerator.total_memory()=}')
-----------[code] test_get.py -----------
---[output] python test_get.py---------
@@ -81,16 +191,16 @@ def set_accelerator(accel_obj):
-----------[code] test_set.py -----------
from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
+from deepspeed.utils import logger
cu_accel = CUDA_Accelerator()
-print(f'{id(cu_accel)=}')
+logger.info(f'{id(cu_accel)=}')
from deepspeed.accelerator import set_accelerator, get_accelerator
set_accelerator(cu_accel)
my_accelerator = get_accelerator()
-print(f'{id(my_accelerator)=}')
-print(f'{my_accelerator._name=}')
-print(f'{my_accelerator._communication_backend=}')
-print(f'{my_accelerator.HalfTensor().device=}')
-print(f'{my_accelerator.total_memory()=}')
+logger.info(f'{id(my_accelerator)=}')
+logger.info(f'{my_accelerator._name=}')
+logger.info(f'{my_accelerator._communication_backend=}')
+logger.info(f'{my_accelerator.HalfTensor().device=}')
+logger.info(f'{my_accelerator.total_memory()=}')
-----------[code] test_set.py -----------
@@ -102,4 +212,4 @@ def set_accelerator(accel_obj):
my_accelerator.HalfTensor().device=device(type='cuda', index=0)
my_accelerator.total_memory()=34089730048
---[output] python test_set.py---------
-'''
+"""
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000000..4c88b2dd091c
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,6 @@
+# DeepSpeed Benchmarks
+
+If you are looking for DeepSpeed benchmarks, please see the following resources:
+
+1. [Communication Benchmarking Suite](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/communication)
+2. [Inference Benchmarks](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/inference)
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
deleted file mode 100644
index fcb45ab2b685..000000000000
--- a/benchmarks/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
diff --git a/benchmarks/communication/README.md b/benchmarks/communication/README.md
deleted file mode 100644
index f760465b5c97..000000000000
--- a/benchmarks/communication/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Running Communication Benchmarks
-
-
-To run benchmarks, there are two options:
-
-1. Run a single communication operation:
-
-For example, run with a single large message size:
-
-deepspeed all_reduce.py
-
-
-Scan across message sizes:
-
-deepspeed all_reduce.py --scan
-
-
-2. Run all available communication benchmarks:
-
-
-deepspeed run_all.py
-
-
-Like the individual benchmarks, `run_all.py` supports scanning arguments for the max message size, bw-unit, etc. Simply pass the desired arguments to `run_all.py` and they'll be propagated to each comm op.
-
-
-usage: ds_bench [-h] [--local_rank LOCAL_RANK] [--trials TRIALS] [--warmups WARMUPS] [--maxsize MAXSIZE] [--async-op] [--bw-unit {Gbps,GBps}] [--backend {nccl}] [--dist {deepspeed,torch}] [--scan] [--raw] [--all-reduce] [--all-gather] [--all-to-all]
- [--pt2pt] [--broadcast] [--dtype DTYPE] [--mem-factor MEM_FACTOR] [--debug]
-
-optional arguments:
- -h, --help show this help message and exit
- --local_rank LOCAL_RANK
- --trials TRIALS Number of timed iterations
- --warmups WARMUPS Number of warmup (non-timed) iterations
- --maxsize MAXSIZE Max message size as a power of 2
- --async-op Enables non-blocking communication
- --bw-unit {Gbps,GBps}
- --backend {nccl} Communication library to use
- --dist {deepspeed,torch}
- Distributed DL framework to use
- --scan Enables scanning all message sizes
- --raw Print the message size and latency without units
- --all-reduce Run all_reduce
- --all-gather Run all_gather
- --all-to-all Run all_to_all
- --pt2pt Run pt2pt
- --broadcast Run broadcast
- --dtype DTYPE PyTorch tensor dtype
- --mem-factor MEM_FACTOR
- Proportion of max available GPU memory to use for single-size evals
- --debug Enables all_to_all debug prints
-
-
-Note that `ds_bench` is a pre-packaged wrapper around `run_all.py`. Users can pass the same arguments as well:
-
-
-/bin/ds_bench --scan --trials=10
-
-
-Finally, users can choose specific communication operations to run in `run_all.py` or `ds_bench` by passing them as arguments (all operations are run by default). For example:
-
-
-deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
-
-
-
-# Adding Communication Benchmarks
-
-To add new communication benchmarks, follow this general procedure:
-
-1. Copy a similar benchmark file (e.g. to add `reduce_scatter`, copy `all_reduce.py` as a template)
-2. Add a new bw formula in `utils.get_bw`, a new maximum tensor element formula in `utils.max_numel`, and a new arg in `utils.benchmark_parser`
-3. Replace comm op calls in new file with find-replace
-4. Find a good default `mem_factor` for use in `run__single()` function
-5. Add new comm op to `run_all.py`
diff --git a/benchmarks/communication/__init__.py b/benchmarks/communication/__init__.py
deleted file mode 100644
index fcb45ab2b685..000000000000
--- a/benchmarks/communication/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py
deleted file mode 100644
index dc97267b3840..000000000000
--- a/benchmarks/communication/all_gather.py
+++ /dev/null
@@ -1,159 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-# Run all_gather and print metrics
-def timed_all_gather(input, output, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- # use all_gather_base if available
- if args.dist == 'torch':
- if hasattr(torch.distributed, "_all_gather_base"):
- dist._all_gather_base(output, input, group=None, async_op=args.async_op)
- else:
- output_tensors = list(
- torch.chunk(output_tensor,
- cdb.get_world_size(group)))
- dist.all_gather(output_tensors, input_tensor, group=group, async_op=True)
- elif args.dist == 'deepspeed':
- dist.allgather_fn(output, input, group=None, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- # use all_gather_base if available
- if args.dist == 'torch':
- if hasattr(torch.distributed, "_all_gather_base"):
- dist._all_gather_base(output, input, group=None, async_op=args.async_op)
- else:
- output_tensors = list(
- torch.chunk(output_tensor,
- cdb.get_world_size(group)))
- dist.all_gather(output_tensors, input_tensor, group=group, async_op=True)
- elif args.dist == 'deepspeed':
- dist.allgather_fn(output, input, group=None, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('all_gather', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_all_gather(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'all_gather')
- global_rank = dist.get_rank()
- world_size = dist.get_world_size()
-
- if args.scan:
- # Create list of message sizes
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- # Delete original mat to avoid OOM
- del mat
- get_accelerator().empty_cache()
- output = torch.zeros(input.nelement() * world_size,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_all_gather(input, output, args)
- else:
- # all_gather_base saves memory
- if (args.dist == 'torch'
- and hasattr(torch.distributed,
- "_all_gather_base")) or (args.dist == 'deepspeed'
- and dist.has_allgather_base):
- mem_factor = args.mem_factor + 0.2
- else:
- mem_factor = args.mem_factor
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- sync_all()
- elements_per_gpu = max_numel(comm_op='all_gather',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=mem_factor,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- # multiply each GPU's tensor by the rank to ease debugging
- input = ((mat.mul_(float(global_rank))).view(-1))
- # Delete original mat to avoid OOM
- del mat
- get_accelerator().empty_cache()
- output = torch.zeros(
- elements_per_gpu * world_size,
- dtype=getattr(torch,
- args.dtype)).to(get_accelerator().device_name(local_rank))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
-
- sync_all()
- timed_all_gather(input, output, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_all_gather(local_rank=rank, args=args)
diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py
deleted file mode 100644
index edc1b99301c0..000000000000
--- a/benchmarks/communication/all_reduce.py
+++ /dev/null
@@ -1,113 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_all_reduce(input, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- dist.all_reduce(input, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- dist.all_reduce(input, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('all_reduce', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_all_reduce(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'all_reduce')
-
- world_size = dist.get_world_size()
- global_rank = dist.get_rank()
-
- if args.scan:
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_all_reduce(input, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- # Don't need output tensor, so we double mem_factor
- elements_per_gpu = max_numel(comm_op='all_reduce',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor * 2,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
- timed_all_reduce(input, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_all_reduce(local_rank=rank, args=args)
diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py
deleted file mode 100644
index bd35cf290e4c..000000000000
--- a/benchmarks/communication/all_to_all.py
+++ /dev/null
@@ -1,134 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_all_to_all(input, output, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- dist.all_to_all_single(output, input, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- dist.all_to_all_single(output, input, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('all_to_all', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_all_to_all(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- world_size = dist.get_world_size()
- global_rank = dist.get_rank()
- # Prepare benchmark header
- print_header(args, 'all_to_all')
-
- if args.scan:
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks"
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- output = (mat.clone().view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_all_to_all(input, output, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- elements_per_gpu = max_numel(comm_op='all_to_all',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks"
- input = ((mat.mul_(float(global_rank))).view(-1))
- # Delete original mat to avoid OOM
- del mat
- get_accelerator().empty_cache()
- output = torch.zeros(
- elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(get_accelerator().device_name(local_rank))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
-
- if args.debug:
- for i in range(world_size):
- if i == global_rank:
- print(f"Before AllToAll Input List at rank {global_rank}: {input}")
- dist.barrier()
-
- timed_all_to_all(input, output, args)
-
- if args.debug:
- for i in range(world_size):
- if i == global_rank:
- print(f"AllToAll Results at rank {global_rank}: {output}")
- dist.barrier()
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_all_to_all(local_rank=rank, args=args)
diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py
deleted file mode 100644
index 633e46638fac..000000000000
--- a/benchmarks/communication/broadcast.py
+++ /dev/null
@@ -1,114 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import torch
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_broadcast(input, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- dist.broadcast(input, 0, async_op=args.async_op)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- dist.broadcast(input, 0, async_op=args.async_op)
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('broadcast', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_broadcast(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'broadcast')
-
- world_size = dist.get_world_size()
- global_rank = dist.get_rank()
-
- if args.scan:
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_broadcast(input, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- # Don't need output tensor, so we double mem_factor
- elements_per_gpu = max_numel(comm_op='broadcast',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor * 2,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
- timed_broadcast(input, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_broadcast(local_rank=rank, args=args)
diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py
deleted file mode 100644
index 935927acd174..000000000000
--- a/benchmarks/communication/constants.py
+++ /dev/null
@@ -1,10 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-from deepspeed.accelerator import get_accelerator
-
-DEFAULT_WARMUPS = 5
-DEFAULT_TRIALS = 50
-DEFAULT_TYPE = 'float'
-DEFAULT_BACKEND = get_accelerator().communication_backend_name()
-DEFAULT_UNIT = 'Gbps'
-DEFAULT_DIST = 'deepspeed'
-DEFAULT_MAXSIZE = 24
diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py
deleted file mode 100644
index 1c890fc42e93..000000000000
--- a/benchmarks/communication/pt2pt.py
+++ /dev/null
@@ -1,132 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-import time
-
-
-def timed_pt2pt(input, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- sync_all()
- # Warmups, establish connections, etc.
- for i in range(args.warmups):
- if dist.get_rank() == 0:
- if args.async_op:
- dist.isend(input, 1)
- else:
- dist.send(input, 1)
- if dist.get_rank() == 1:
- if args.async_op:
- dist.irecv(input, src=0)
- else:
- dist.recv(input, src=0)
- sync_all()
-
- # time the actual comm op trials times and average it
- pre = time.perf_counter()
- for i in range(args.trials):
- if dist.get_rank() == 0:
- if args.async_op:
- dist.isend(input, 1)
- else:
- dist.send(input, 1)
- if dist.get_rank() == 1:
- if args.async_op:
- dist.irecv(input, src=0)
- else:
- dist.recv(input, src=0)
-
- sync_all()
- duration = time.perf_counter() - pre
-
- # maintain and clean performance data
- avg_duration = duration / args.trials
- size = input.element_size() * input.nelement()
- n = dist.get_world_size()
- tput, busbw = get_bw('pt2pt', size, avg_duration, args)
- tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
- desc = f'{input.nelement()}x{input.element_size()}'
-
- if not args.raw:
- size = convert_size(size)
-
- print_rank_0(
- f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
-
-
-def run_pt2pt(local_rank, args):
- if args.dist == 'torch':
- import torch.distributed as dist
- elif args.dist == 'deepspeed':
- import deepspeed.comm as dist
-
- # Prepare benchmark header
- print_header(args, 'pt2pt')
- global_rank = dist.get_rank()
- world_size = dist.get_world_size()
-
- if args.scan:
- # Create list of message sizes
- M_LIST = []
- for x in (2**p for p in range(1, args.maxsize)):
- M_LIST.append(x)
-
- sync_all()
- # loop over various tensor sizes
- for M in M_LIST:
- global_rank = dist.get_rank()
- try:
- mat = torch.ones(world_size,
- M,
- dtype=getattr(
- torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- sync_all()
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print('WARNING: Ran out of GPU memory. Exiting comm op.')
- sync_all()
- break
- sync_all()
- timed_pt2pt(input, args)
- else:
- # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
- # Don't need output tensor, so double mem_factor
- elements_per_gpu = max_numel(comm_op='pt2pt',
- dtype=getattr(torch,
- args.dtype),
- mem_factor=args.mem_factor * 2,
- local_rank=local_rank,
- args=args)
- try:
- mat = torch.ones(elements_per_gpu,
- dtype=getattr(torch,
- args.dtype)).to(
- get_accelerator().device_name(local_rank))
- input = ((mat.mul_(float(global_rank))).view(-1))
- except RuntimeError as e:
- if 'out of memory' in str(e):
- if dist.get_rank() == 0:
- print(
- 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
- )
- sync_all()
- return
- sync_all()
- timed_pt2pt(input, args)
-
-
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- init_processes(local_rank=rank, args=args)
- run_pt2pt(local_rank=rank, args=args)
diff --git a/benchmarks/communication/run_all.py b/benchmarks/communication/run_all.py
deleted file mode 100644
index 7ec562cc9ae0..000000000000
--- a/benchmarks/communication/run_all.py
+++ /dev/null
@@ -1,49 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-from benchmarks.communication.utils import *
-from benchmarks.communication.all_reduce import run_all_reduce
-from benchmarks.communication.all_gather import run_all_gather
-from benchmarks.communication.all_to_all import run_all_to_all
-from benchmarks.communication.pt2pt import run_pt2pt
-from benchmarks.communication.broadcast import run_broadcast
-from benchmarks.communication.constants import *
-
-
-# For importing
-def main(args, rank):
-
- init_processes(local_rank=rank, args=args)
-
- ops_to_run = []
- if args.all_reduce:
- ops_to_run.append('all_reduce')
- if args.all_gather:
- ops_to_run.append('all_gather')
- if args.broadcast:
- ops_to_run.append('broadcast')
- if args.pt2pt:
- ops_to_run.append('pt2pt')
- if args.all_to_all:
- ops_to_run.append('all_to_all')
-
- if len(ops_to_run) == 0:
- ops_to_run = ['all_reduce', 'all_gather', 'all_to_all', 'broadcast', 'pt2pt']
-
- for comm_op in ops_to_run:
- if comm_op == 'all_reduce':
- run_all_reduce(local_rank=rank, args=args)
- if comm_op == 'all_gather':
- run_all_gather(local_rank=rank, args=args)
- if comm_op == 'all_to_all':
- run_all_to_all(local_rank=rank, args=args)
- if comm_op == 'pt2pt':
- run_pt2pt(local_rank=rank, args=args)
- if comm_op == 'broadcast':
- run_broadcast(local_rank=rank, args=args)
-
-
-# For directly calling benchmark
-if __name__ == "__main__":
- args = benchmark_parser().parse_args()
- rank = args.local_rank
- main(args, rank)
diff --git a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py
deleted file mode 100644
index b913dda14fe5..000000000000
--- a/benchmarks/communication/utils.py
+++ /dev/null
@@ -1,220 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import torch
-import os
-import math
-import argparse
-from benchmarks.communication.constants import *
-from deepspeed.accelerator import get_accelerator
-
-global dist
-
-
-def init_torch_distributed(backend):
- global dist
- import torch.distributed as dist
- torch.distributed.init_process_group(backend)
- local_rank = int(os.environ['LOCAL_RANK'])
- get_accelerator().set_device(local_rank)
-
-
-def init_deepspeed_comm(backend):
- global dist
- import deepspeed
- import deepspeed.comm as dist
- deepspeed.init_distributed(dist_backend=backend)
- local_rank = int(os.environ['LOCAL_RANK'])
- get_accelerator().set_device(local_rank)
-
-
-def init_processes(local_rank, args):
- if args.dist == 'deepspeed':
- init_deepspeed_comm(args.backend)
- elif args.dist == 'torch':
- init_torch_distributed(args.backend)
- else:
- print_rank_0(f"distributed framework {args.dist} not supported")
- exit(0)
-
-
-def print_rank_0(message):
- if dist.get_rank() == 0:
- print(message)
-
-
-def print_header(args, comm_op):
- if comm_op == 'pt2pt':
- world_size = 2
- else:
- world_size = dist.get_world_size()
- tput = f'Throughput ({args.bw_unit})'
- busbw = f'BusBW ({args.bw_unit})'
- header = f"\n---- Performance of {comm_op} on {world_size} devices ---------------------------------------------------------\n"
- duration_str = 'Duration'
- if args.raw:
- duration_str += ' (us)'
- header += f"{'Size (Bytes)':20s} {'Description':25s} {duration_str:20s} {tput:20s} {busbw:20s}\n"
- header += "----------------------------------------------------------------------------------------------------"
- print_rank_0(header)
-
-
-def get_bw(comm_op, size, duration, args):
- n = dist.get_world_size()
- tput = 0
- busbw = 0
- if comm_op == "all_to_all":
- tput = (size / duration)
- busbw = (size / duration) * ((n - 1) / n)
- elif comm_op == "all_gather":
- size *= n
- tput = (size / duration)
- busbw = (size / duration) * ((n - 1) / n)
- elif comm_op == "all_reduce":
- tput = (size * 2 / duration)
- busbw = (size / duration) * (2 * (n - 1) / n)
- elif comm_op == "pt2pt" or comm_op == "broadcast":
- tput = (size / duration)
- busbw = tput
- else:
- print_rank_0("wrong comm_op specified")
- exit(0)
-
- if args.bw_unit == 'Gbps':
- tput *= 8
- busbw *= 8
-
- return tput, busbw
-
-
-def get_metric_strings(args, tput, busbw, duration):
- duration_ms = duration * 1e3
- duration_us = duration * 1e6
- tput = f'{tput / 1e9:.3f}'
- busbw = f'{busbw /1e9:.3f}'
-
- if duration_us < 1e3 or args.raw:
- duration = f'{duration_us:.3f}'
- if not args.raw:
- duration += ' us'
- else:
- duration = f'{duration_ms:.3f} ms'
- return tput, busbw, duration
-
-
-def sync_all():
- get_accelerator().synchronize()
- dist.barrier()
-
-
-def max_numel(comm_op, dtype, mem_factor, local_rank, args):
- dtype_size = _element_size(dtype)
- max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor
- if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast':
- elements_per_gpu = int(max_memory_per_gpu // dtype_size)
- elif comm_op == 'all_gather':
- # all_gather performance is lower for non-powers of two, and the output buffer size scales with world size
- # Therefore, divide by world size and round down to nearest power of 2
- elements_per_gpu = int(max_memory_per_gpu // dtype_size // dist.get_world_size())
- elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
- elif comm_op == 'all_to_all':
- # Number of elements must be divisible by world_size
- # all_to_all performance is lower for non-powers of two. Round down like all_gather.
- elements_per_gpu = int(max_memory_per_gpu // dtype_size)
- elements_per_gpu = int(dist.get_world_size() *
- round(elements_per_gpu / dist.get_world_size()))
- elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
- else:
- print(f"This communication operation: {comm_op} is not supported yet")
- exit(0)
- return elements_per_gpu
-
-
-# Helper function to pretty-print message sizes
-def convert_size(size_bytes):
- if size_bytes == 0:
- return "0B"
- size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
- i = int(math.floor(math.log(size_bytes, 1024)))
- p = math.pow(1024, i)
- s = round(size_bytes / p, 2)
- return "%s %s" % (s, size_name[i])
-
-
-# Copied from torch. Need to add the func here for old torch compatibility.
-def _element_size(dtype):
- """
- Returns the element size for a dtype, in bytes
- """
- if not isinstance(dtype, torch.dtype):
- raise RuntimeError(f'expected torch.dtype, but got {type(dtype)}')
-
- if dtype.is_complex:
- return torch.finfo(dtype).bits >> 2
- elif dtype.is_floating_point:
- return torch.finfo(dtype).bits >> 3
- elif dtype == torch.bool:
- # NOTE: torch.bool is not supported in torch.iinfo()
- return 1
- else:
- return torch.iinfo(dtype).bits >> 3
-
-
-def benchmark_parser():
- parser = argparse.ArgumentParser()
- parser.add_argument("--local_rank", type=int)
- parser.add_argument("--trials",
- type=int,
- default=DEFAULT_TRIALS,
- help='Number of timed iterations')
- parser.add_argument("--warmups",
- type=int,
- default=DEFAULT_WARMUPS,
- help='Number of warmup (non-timed) iterations')
- parser.add_argument("--maxsize",
- type=int,
- default=24,
- help='Max message size as a power of 2')
- parser.add_argument("--async-op",
- action="store_true",
- help='Enables non-blocking communication')
- parser.add_argument("--bw-unit",
- type=str,
- default=DEFAULT_UNIT,
- choices=['Gbps',
- 'GBps'])
- parser.add_argument("--backend",
- type=str,
- default=DEFAULT_BACKEND,
- choices=['nccl',
- 'ccl'],
- help='Communication library to use')
- parser.add_argument("--dist",
- type=str,
- default=DEFAULT_DIST,
- choices=['deepspeed',
- 'torch'],
- help='Distributed DL framework to use')
- parser.add_argument("--scan",
- action="store_true",
- help='Enables scanning all message sizes')
- parser.add_argument("--raw",
- action="store_true",
- help='Print the message size and latency without units')
- parser.add_argument("--all-reduce", action="store_true", help='Run all_reduce')
- parser.add_argument("--all-gather", action="store_true", help='Run all_gather')
- parser.add_argument("--all-to-all", action="store_true", help='Run all_to_all')
- parser.add_argument("--pt2pt", action="store_true", help='Run pt2pt')
- parser.add_argument("--broadcast", action="store_true", help='Run broadcast')
- parser.add_argument("--dtype",
- type=str,
- default=DEFAULT_TYPE,
- help='PyTorch tensor dtype')
- parser.add_argument(
- "--mem-factor",
- type=float,
- default=.4,
- help='Proportion of max available GPU memory to use for single-size evals')
- parser.add_argument("--debug",
- action="store_true",
- help='Enables all_to_all debug prints')
- return parser
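
For reference, the get_bw() helper above (these benchmarks now live in DeepSpeedExamples, per the new benchmarks/README.md) applies the standard bus-bandwidth corrections. A worked example for all_reduce with illustrative numbers:

-----------[code] busbw_example.py -----------
size = 2**30       # 1 GiB message
duration = 0.050   # 50 ms measured average per op
n = 8              # world size

tput = size * 2 / duration                     # algorithm bandwidth, bytes/s
busbw = (size / duration) * (2 * (n - 1) / n)  # ring all_reduce bus bandwidth
print(f"tput  = {tput * 8 / 1e9:.1f} Gbps")    # 343.6 Gbps
print(f"busbw = {busbw * 8 / 1e9:.1f} Gbps")   # 300.6 Gbps
-----------[code] busbw_example.py -----------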
diff --git a/benchmarks/inference/bert-bench.py b/benchmarks/inference/bert-bench.py
deleted file mode 100644
index 9d586d033cd7..000000000000
--- a/benchmarks/inference/bert-bench.py
+++ /dev/null
@@ -1,92 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import torch
-import time
-import deepspeed
-import argparse
-from transformers import pipeline
-from deepspeed.accelerator import get_accelerator
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--model", "-m", type=str, help="hf model name")
-parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
-parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
-parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
-parser.add_argument("--local_rank", type=int, default=0, help="local rank")
-parser.add_argument("--trials", type=int, default=30, help="number of trials")
-parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
-parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
-args = parser.parse_args()
-
-
-def print_latency(latency_set, title, warmup=3):
- # trim warmup queries
- latency_set = latency_set[warmup:]
- count = len(latency_set)
- if count > 0:
- latency_set.sort()
- n50 = (count - 1) * 0.5 + 1
- n90 = (count - 1) * 0.9 + 1
- n95 = (count - 1) * 0.95 + 1
- n99 = (count - 1) * 0.99 + 1
- n999 = (count - 1) * 0.999 + 1
-
- avg = sum(latency_set) / count
- p50 = latency_set[int(n50) - 1]
- p90 = latency_set[int(n90) - 1]
- p95 = latency_set[int(n95) - 1]
- p99 = latency_set[int(n99) - 1]
- p999 = latency_set[int(n999) - 1]
-
- print(f"====== latency stats {title} ======")
- print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
- print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
- print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
- print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
- print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
- print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))
-
-
-deepspeed.init_distributed()
-
-print(args.model, args.max_tokens, args.dtype)
-
-if args.dtype.lower() == "fp16":
- dtype = torch.float16
-else:
- dtype = torch.float32
-
-pipe = pipeline("fill-mask", model=args.model, framework="pt", device=args.local_rank)
-
-if dtype == torch.half:
- pipe.model.half()
-
-mask = pipe.tokenizer.mask_token
-
-br = pipe(f"Hello I'm a {mask} model")
-if args.deepspeed:
- pipe.model = deepspeed.init_inference(pipe.model,
- dtype=dtype,
- mp_size=1,
- replace_with_kernel_inject=args.kernel_inject,
- enable_cuda_graph=args.graphs)
- pipe.model.profile_model_time()
-
-responses = []
-times = []
-mtimes = []
-for i in range(args.trials):
- get_accelerator().synchronize()
- start = time.time()
- r = pipe(f"Hello I'm a {mask} model")
- get_accelerator().synchronize()
- end = time.time()
- responses.append(r)
- times.append((end - start))
- mtimes += pipe.model.model_times()
- #print(f"{pipe.model.model_times()=}")
-
-print_latency(times, "e2e latency")
-print_latency(mtimes, "model latency")
-
-print(responses[0:3])
diff --git a/benchmarks/inference/collect_results.py b/benchmarks/inference/collect_results.py
deleted file mode 100644
index 0e51033114db..000000000000
--- a/benchmarks/inference/collect_results.py
+++ /dev/null
@@ -1,147 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import os
-import re
-import argparse
-import pandas as pd
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "--results-dir",
- "-r",
- type=str,
- default="./results",
- help="directory containing sweep results",
-)
-parser.add_argument("--version",
- "-v",
- type=int,
- default=0,
- help="version to be collected")
-parser.add_argument("--gen-text-n",
- "-n",
- type=int,
- default=1,
- help="expected number of generated text")
-parser.add_argument("--output",
- "-o",
- type=str,
- default="./results.csv",
- help="output file")
-args = parser.parse_args()
-
-
-def get_branch(file_path):
- match = re.match(r".*\/(.*)\.log", file_path)
- if match is None:
- return False
- else:
- return match.groups()[0]
-
-
-def get_benchmark_params(root_dir, file_path):
- match = re.match(
- rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/",
- file_path,
- )
- if match is None:
- return False
- else:
- model, dtype, graphs, kernel, gpus, version = match.groups()
- bool_dict = {"true": True, "false": False}
- return {
- "model": model,
- "dtype": dtype,
- "graphs": bool_dict[graphs.lower()],
- "kernel": bool_dict[kernel.lower()],
- "gpus": int(gpus),
- "version": int(version),
- }
-
-
-def get_perf_data(file_content):
- matches = re.findall(r"\s+(.+?)\sLatency:\s+(\d+\.\d+)\sms", file_content)
- if matches is []:
- return False
- else:
- return {f"latency-{key}": float(val) for key, val in matches}
-
-
-def get_generated_text(file_content, gen_text_n):
- file_content = file_content.replace("\n", " ")
- file_content = file_content.replace("\t", " ")
- matches = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", file_content)
- if len(matches) != gen_text_n:
- return False
- else:
- return {f"generated-text-{key}": val for key, val in matches}
-
-
-def get_error(file_content):
- matches = re.findall(r"Error:\s+(.+?)\n", file_content)
- if matches is []:
- return False
- else:
- return {f"error": val for val in matches}
-
-
-if __name__ == "__main__":
- # List to collect data from all benchmarks
- benchmarks_data = []
-
- # Walk through directory of results from sweep.sh
- for root, dirs, files in os.walk(args.results_dir):
- # Because of how some models are named, the dir structure for results can vary, e.g.:
- # "EleutherAI/gpt-neo_*/baseline.log" versus "gpt2_*/baseline.log"
- if dirs:
- continue
-
- # Get data from baseline and each tested branch
- for name in files:
- file_path = os.path.join(root, name)
-
- branch = get_branch(file_path)
- if not branch:
- print(f"WARNING: Could not detect branch for file {file_path}, skipping")
- continue
-
- params = get_benchmark_params(args.results_dir, file_path)
- if not params:
- print(
- f"WARNING: Could not detect benchmark settings for file {file_path}, skipping"
- )
- continue
-
- # Verify that the version matches that which we want to collect
- if params["version"] != args.version:
- continue
-
- with open(file_path, "r") as f:
- file_content = f.read()
-
- perf_data = get_perf_data(file_content)
- if not perf_data:
- print(
- f"WARNING: Could not detect benchmark performance data for file {file_path}"
- )
-
- generated_text = get_generated_text(file_content, args.gen_text_n)
- if not generated_text:
- print(f"WARNING: Could not detect generated text for file {file_path}")
-
- error = get_error(file_content)
- if error:
- print(f"Error found in {file_path}, collecting error info...")
- benchmarks_data.append({"branch": branch, **params, **error})
- continue
-
- benchmarks_data.append({
- "branch": branch,
- **params,
- **perf_data,
- **generated_text
- })
-
- # Convert to a DataFrame and save
- benchmarks_df = pd.DataFrame(benchmarks_data)
- benchmarks_df.to_csv(args.output)
diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py
deleted file mode 100644
index 29578b30cf1f..000000000000
--- a/benchmarks/inference/gpt-bench.py
+++ /dev/null
@@ -1,124 +0,0 @@
-'''Copyright The Microsoft DeepSpeed Team'''
-
-import os
-import torch
-import time
-import deepspeed
-import argparse
-from transformers import pipeline
-from deepspeed.accelerator import get_accelerator
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--model", "-m", type=str, help="hf model name")
-parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
-parser.add_argument("--dtype",
- type=str,
- default="fp16",
- choices=["fp16",
- "fp32",
- "int8"],
- help="int8, fp16, or fp32")
-parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
-parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
-parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
-parser.add_argument("--local_rank",
- type=int,
- default=int(os.getenv("LOCAL_RANK",
- "0")),
- help="local rank")
-parser.add_argument("--world_size",
- type=int,
- default=int(os.getenv("WORLD_SIZE",
- "1")),
- help="world size")
-parser.add_argument("--trials", type=int, default=30, help="number of trials")
-args = parser.parse_args()
-
-
-def print_latency(latency_set, title, warmup=3):
- # trim warmup queries
- latency_set = list(latency_set)
- latency_set = latency_set[warmup:]
- count = len(latency_set)
- if count > 0:
- latency_set.sort()
- n50 = (count - 1) * 0.5 + 1
- n90 = (count - 1) * 0.9 + 1
- n95 = (count - 1) * 0.95 + 1
- n99 = (count - 1) * 0.99 + 1
- n999 = (count - 1) * 0.999 + 1
-
- avg = sum(latency_set) / count
- p50 = latency_set[int(n50) - 1]
- p90 = latency_set[int(n90) - 1]
- p95 = latency_set[int(n95) - 1]
- p99 = latency_set[int(n99) - 1]
- p999 = latency_set[int(n999) - 1]
-
- print(f"====== latency stats {title} ======")
- print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
- print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
- print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
- print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
- print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
- print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))
-
-
-deepspeed.init_distributed()
-
-if args.local_rank == 0:
- print("BENCHMARK SETTINGS:")
- print(f"\tMODEL: {args.model}")
- print(f"\tMAX_TOKENS: {args.max_tokens}")
- print(f"\tDTYPE: {args.dtype}")
- print(f"\tCUDA_GRAPHS: {args.graphs}")
- print(f"\tKERNEL_INJECT: {args.kernel_inject}")
-
-if args.dtype == "int8":
- dtype = torch.int8
-elif args.dtype == "fp16":
- dtype = torch.float16
-else:
- dtype = torch.float32
-
-pipe = pipeline("text-generation",
- model=args.model,
- framework="pt",
- device=args.local_rank)
-
-if dtype == torch.float16:
- pipe.model.half()
-
-if args.deepspeed:
- pipe.model = deepspeed.init_inference(
- pipe.model,
- dtype=dtype,
- mp_size=args.world_size,
- replace_with_kernel_inject=args.kernel_inject,
- enable_cuda_graph=args.graphs,
- )
- pipe.model.profile_model_time()
-
-responses = []
-times = []
-mtimes = []
-for i in range(args.trials):
- get_accelerator().synchronize()
- start = time.time()
- r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
- get_accelerator().synchronize()
- end = time.time()
- responses.append(r)
- times.append(end - start) # / (args.max_tokens - 3))
- mtimes.append(sum(pipe.model.model_times()))
-
-if args.local_rank == 0:
- print_latency(times, "(e2e) latency")
- print_latency(mtimes, "(model-only) latency")
- print_latency(map(lambda t: t / (args.max_tokens - 3),
- times),
- "(e2e) per token latency")
- print(f"RESPONSE 0:")
- print("-" * 30)
- print(responses[0][0]["generated_text"])
- print("-" * 30)
diff --git a/benchmarks/inference/requirements.txt b/benchmarks/inference/requirements.txt
deleted file mode 100644
index 00899dd5f485..000000000000
--- a/benchmarks/inference/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-transformers>=4.21.3
diff --git a/benchmarks/inference/run_model.sh b/benchmarks/inference/run_model.sh
deleted file mode 100644
index 8e5fe3ac0133..000000000000
--- a/benchmarks/inference/run_model.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-set -x
-
-model=$1
-branch1=$2
-branch2=$3
-dtype=$4
-graphs=$5
-kernel=$6
-gpus=$7
-
-version=0
-log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version}
-mkdir -p ${log_path}
-
-params="--dtype $dtype "
-if [[ "$graphs" == "true" ]]; then
- params+="--graphs "
-fi
-if [[ "$kernel" == "true" ]]; then
- params+="--kernel "
-fi
-
-echo "baseline $log_path"
-deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log
-
-cd ../../
-git checkout ${branch1}
-cd -
-echo "ds ${branch1} $log_path"
-deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch1}.log
-
-cd ../../
-git checkout ${branch2}
-cd -
-echo "ds ${branch2} $log_path"
-deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params&> ${log_path}/ds-${branch2}.log
diff --git a/benchmarks/inference/sweep.sh b/benchmarks/inference/sweep.sh
deleted file mode 100644
index aabcb0bfdbd8..000000000000
--- a/benchmarks/inference/sweep.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-set -x
-
-export TRANSFORMERS_CACHE=/tmp/hf-cache
-
-branch1=$1
-branch2=$2
-
-gptneo_models="EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M"
-gpt2_models="gpt2 gpt2-large gpt2-xl"
-gptj_models="EleutherAI/gpt-j-6B"
-opt_models="facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b"
-bloom_models="bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1"
-
-for gpus in `echo "1 2 4 8"`; do
- for dtype in `echo "fp16 fp32"`; do
- for graphs in `echo "true false"`; do
- for kernel in `echo "true false"`; do
- params="$dtype $graphs $kernel $gpus"
- for m in `echo "$gptneo_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$gpt2_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$gptj_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$opt_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
-
- for m in `echo "$bloom_models"`; do
- bash run_model.sh $m $branch1 $branch2 $params
- done
- done
- done
- done
-done
diff --git a/bin/ds_elastic b/bin/ds_elastic
index c9987d4565da..1c78aea88894 100755
--- a/bin/ds_elastic
+++ b/bin/ds_elastic
@@ -9,11 +9,7 @@ from deepspeed.elasticity import compute_elastic_config
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json")
- parser.add_argument('-w',
- '--world-size',
- type=int,
- default=0,
- help="Intended/current world size")
+ parser.add_argument('-w', '--world-size', type=int, default=0, help="Intended/current world size")
args = parser.parse_args()
ds_config = json.load(open(args.config, 'r'))
@@ -26,7 +22,9 @@ if __name__ == '__main__':
print(json.dumps(elastic_config, indent=4, sort_keys=True))
if args.world_size > 0:
- final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version, world_size=args.world_size)
+ final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config,
+ target_deepspeed_version=ds_version,
+ world_size=args.world_size)
print('------------------------------------------')
print(f"Calculated results for world size {args.world_size}:")
print('------------------------------------------')
diff --git a/blogs/README.md b/blogs/README.md
new file mode 100644
index 000000000000..c5c72fd4efb0
--- /dev/null
+++ b/blogs/README.md
@@ -0,0 +1 @@
+All DeepSpeed blogs are linked here:
diff --git a/blogs/assets/images/Throughputs-OPT.png b/blogs/assets/images/Throughputs-OPT.png
new file mode 100755
index 000000000000..9b968319f759
Binary files /dev/null and b/blogs/assets/images/Throughputs-OPT.png differ
diff --git a/blogs/assets/images/ds-chat-overview.png b/blogs/assets/images/ds-chat-overview.png
new file mode 100755
index 000000000000..479ca4bb2ad7
Binary files /dev/null and b/blogs/assets/images/ds-chat-overview.png differ
diff --git a/blogs/assets/images/figure3.png b/blogs/assets/images/figure3.png
new file mode 100755
index 000000000000..44b8e67bd6b9
Binary files /dev/null and b/blogs/assets/images/figure3.png differ
diff --git a/blogs/assets/images/figure4.png b/blogs/assets/images/figure4.png
new file mode 100755
index 000000000000..dca56637049c
Binary files /dev/null and b/blogs/assets/images/figure4.png differ
diff --git a/blogs/assets/images/figure5.png b/blogs/assets/images/figure5.png
new file mode 100755
index 000000000000..6282c0d19ed1
Binary files /dev/null and b/blogs/assets/images/figure5.png differ
diff --git a/blogs/assets/images/figure6.png b/blogs/assets/images/figure6.png
new file mode 100755
index 000000000000..8e60773b5709
Binary files /dev/null and b/blogs/assets/images/figure6.png differ
diff --git a/blogs/assets/images/figure7.png b/blogs/assets/images/figure7.png
new file mode 100755
index 000000000000..9b400b11efb0
Binary files /dev/null and b/blogs/assets/images/figure7.png differ
diff --git a/blogs/assets/images/hero-figure-black-ja.png b/blogs/assets/images/hero-figure-black-ja.png
new file mode 100644
index 000000000000..5c6cf05d3e7a
Binary files /dev/null and b/blogs/assets/images/hero-figure-black-ja.png differ
diff --git a/blogs/assets/images/hero-figure-black.png b/blogs/assets/images/hero-figure-black.png
new file mode 100755
index 000000000000..6a05f4b27bb9
Binary files /dev/null and b/blogs/assets/images/hero-figure-black.png differ
diff --git a/blogs/assets/images/hybrid-engine.png b/blogs/assets/images/hybrid-engine.png
new file mode 100755
index 000000000000..5e501108a5b7
Binary files /dev/null and b/blogs/assets/images/hybrid-engine.png differ
diff --git a/blogs/assets/images/mascot.png b/blogs/assets/images/mascot.png
new file mode 100755
index 000000000000..e9f7a354fc85
Binary files /dev/null and b/blogs/assets/images/mascot.png differ
diff --git a/blogs/assets/images/triton-bert-base-latency.png b/blogs/assets/images/triton-bert-base-latency.png
new file mode 100644
index 000000000000..4f733f4d1afe
Binary files /dev/null and b/blogs/assets/images/triton-bert-base-latency.png differ
diff --git a/blogs/assets/images/triton-bert-large-latency.png b/blogs/assets/images/triton-bert-large-latency.png
new file mode 100644
index 000000000000..d82dc0ccac51
Binary files /dev/null and b/blogs/assets/images/triton-bert-large-latency.png differ
diff --git a/blogs/comm-opt/README.md b/blogs/comm-opt/README.md
new file mode 100644
index 000000000000..4767c4342816
--- /dev/null
+++ b/blogs/comm-opt/README.md
@@ -0,0 +1,82 @@
+
+
+# Communication Optimizations for Large-Scale Training
+
+
+
+
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Gradient AllReduce Optimization for ZeRO stages 1 and 2](#ar-opt)
+3. [Optimizing Parameter All-Gather for ZeRO2 Training](#ag-opt)
+4. [Optimizing AlltoAll for Sequence-Parallel Training](#sp-opt)
+
+
+## 1. Introduction
+Training LLMs on large datasets can be extremely costly in terms of both hardware resources and time. An important step in minimizing such costs is to carefully combine an appropriate number of resources with a scalable library that guarantees training completion within a time limit. In this post, we discuss a key aspect of DeepSpeed's scalability features: communication optimization. Communication collectives (e.g., all-reduce, all-gather, etc.) are critical pieces of many popular DeepSpeed technologies (e.g., ZeRO, MoE, AutoTP, etc.), and in the following sections we discuss our new optimizations for some of these collectives. These optimizations are available in DeepSpeed versions >= 0.x.x.
+
+## 2. Gradient AllReduce Optimization for ZeRO stages 1 and 2
+
+Before diving into this optimization, let's take a step back and show some of the case studies that demonstrate the need.
+
+The AllReduce operation is an important part of the training process. In ZeRO, we handle it in buckets, which can be configured for good communication throughput. As the number of GPUs increases, however, we encounter smaller-partition AllReduces, where the current bucketing scheme cannot amortize the communication overhead. This mostly becomes an issue when training smaller-scale models (like Llama-7B) with a large number of GPUs.
+
+For instance, when training a dense 7B architecture with ZeRO stages 1 or 2, the AllReduce time increases by 1 and 2 seconds when scaling from 256 to 512 and 1024 A100 GPUs, respectively. This issue mostly arises from the fact that gradient averaging happens with smaller partitions (#parameters / #GPUs) per GPU rank. It gets more serious when training MoE architectures (3 - 12 seconds), for which the experts' parameters can sit farther apart due to the current parallelism layout of data and expert parallelism.
+
+In this section, we introduce two main optimization techniques for alleviating these communication bottlenecks.
+
+First, multi-rank bucketing for the same process group: for this optimization, we simply pack all the data that needs to be reduced from different ranks into one big flattened tensor and call a single AllReduce instead of per-rank reduce operations. After the reduction, we scatter the right portion of the data back to the corresponding ranks.
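+
+As an illustration of the multi-rank bucketing idea, here is a minimal sketch (not DeepSpeed's actual implementation; `fused_grad_reduce` and the equal-partition assumption are ours): the whole gradient bucket is reduced with one collective, and each rank then keeps only the slice it owns.
+
+```python
+import torch
+import torch.distributed as dist
+
+def fused_grad_reduce(flat_grads: torch.Tensor, group=None) -> torch.Tensor:
+    """One AllReduce over the whole flattened gradient bucket; each rank
+    then keeps only the partition it owns, instead of issuing one small
+    Reduce per rank. Assumes numel is divisible by the world size."""
+    world = dist.get_world_size(group)
+    rank = dist.get_rank(group)
+    dist.all_reduce(flat_grads, op=dist.ReduceOp.SUM, group=group)
+    flat_grads.div_(world)  # gradient averaging
+    part = flat_grads.numel() // world
+    return flat_grads.narrow(0, rank * part, part)
+```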
+
+Second, a new layout for expert-data parallelism: the default parallelism layout for MoE architectures (as shown in Fig 1) places the experts first on E parallel GPUs and then replicates them D times (data-parallel). With this layout, we encounter a slower AllReduce, as data-parallel ranks are placed farther apart, especially when the communication crosses nodes. We call this layout E + D.
+
+
+
+
+ *Fig 1: Different MoE parallel layouts. Left) E + D, which places the GPUs in the EP dimension first before adding DP; right) D + E, which replicates each expert DP times before constructing EP. We get a faster AllReduce with the second layout at the cost of a slower AlltoAll. This potentially results in faster e2e training time, as the communication volume of AllReduce (total parameter size) is normally much larger than that of AlltoAll (MLP activation memory).*
+
+By changing this layout from E + D to D + E (shown in Fig 1), where we first replicate each expert D times and then lay the replicas out across the expert-parallel dimension, we can reduce the AllReduce time substantially. On an A100-DGX cluster, where each node has 8 GPUs, we see about an 8x reduction in cross-node InfiniBand communication volume for the parameter-update process, which is now handled faster over the intra-node NVLinks. Note that this optimization increases the cost of the AlltoAll in the MoE part of the model; however, we have seen that the performance benefit on AllReduce outweighs this cost.
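+
+To make the two layouts concrete, the following illustrative helper (not DeepSpeed code; `moe_rank_groups` is a name we made up) enumerates the expert-parallel (EP) and data-parallel (DP) rank groups implied by each ordering:
+
+```python
+def moe_rank_groups(world_size: int, ep_size: int, layout: str):
+    """List EP and DP rank groups for the E + D and D + E layouts of Fig 1."""
+    dp_size = world_size // ep_size
+    if layout == "E+D":
+        # EP groups are contiguous; an expert's DP replicas are strided by E
+        ep = [list(range(d * ep_size, (d + 1) * ep_size)) for d in range(dp_size)]
+        dp = [list(range(e, world_size, ep_size)) for e in range(ep_size)]
+    else:  # "D+E"
+        # DP groups are contiguous (fast intra-node AllReduce); EP is strided
+        dp = [list(range(e * dp_size, (e + 1) * dp_size)) for e in range(ep_size)]
+        ep = [list(range(d, world_size, dp_size)) for d in range(dp_size)]
+    return ep, dp
+
+# 16 GPUs (2 nodes x 8), ep_size=8: under E+D the DP group of expert 0 is
+# [0, 8] (cross-node); under D+E it is [0, 1] (intra-node NVLink).
+```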
+
+Table 1 summarizes the savings observed when training a 7B dense and an MoE architecture with the optimized AllReduce scheme. After applying the multi-rank bucketing technique, we reduce the AllReduce time by 4x for the dense architecture and 5x - 8x for the MoE one. In addition, we obtain an extra 3x saving using the new D + E layout for the MoE architecture. Thus, the performance gain is larger for MoE architectures on large GPU counts. For instance, when training a 7B-base MoE architecture, we reduce the iteration time from 13 sec to 9.5 sec on 512 GPUs (37%) and from 16.1 sec to 5.1 sec on a 1k-GPU setup (3.2x).
+
+
+| | GPUs | AllReduce time (s) | Iteration time (s) |
+|----------|:------:|:------:|:------:|
+| baseline (dense) | 1024 | 1.2 | 5.4 |
+| optimized (dense) | 1024 | 0.36 | 4.5 |
+| baseline (MoE) | 1024 | 11.5 | 16.1 |
+| optimized (MoE) | 1024 | 0.45 | 5.1 |
+
+Table 1. AllReduce saving observed for both dense and MoE architectures.
+
+
+
+## 3. Optimizing Parameter All-Gather for ZeRO2 Training
+
+As with AllReduce, all-gather takes longer when there are more partitions. Since the parameters are stored in a flattened buffer for ZeRO stage 2, we can use a single call to all-gather the parameters into this tensor.
+
+When all-gathering the updated parameters at ZeRO stage 2, the bucketing scheme uses several narrow operations and creates a list of tensors with the bucket size from each partition. This scheme was needed to align with the `all_gather` operation from PyTorch.
+However, by adding support for `all_gather_into_tensor`, an operation available in newer versions of PyTorch, we can use a single kernel call to do the full-parameter all-gather. With this optimization, we see about a 2x reduction in the step time for large-scale training.
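+
+A minimal sketch of the two code paths (illustrative, not DeepSpeed's source; `flat_params` and the equal-partition layout are assumptions):
+
+```python
+import torch
+import torch.distributed as dist
+
+def allgather_updated_params(flat_params: torch.Tensor, group=None):
+    """Each rank owns one contiguous, equally-sized partition of the
+    flattened parameter buffer and broadcasts it to everyone."""
+    world = dist.get_world_size(group)
+    part = flat_params.numel() // world
+    my_part = flat_params.narrow(0, dist.get_rank(group) * part, part).clone()
+    if hasattr(dist, "all_gather_into_tensor"):
+        # newer PyTorch: one fused call writing straight into the flat buffer
+        dist.all_gather_into_tensor(flat_params, my_part, group=group)
+    else:
+        # older path: narrow a list of views and use the list-based API
+        shards = [flat_params.narrow(0, r * part, part) for r in range(world)]
+        dist.all_gather(shards, my_part, group=group)
+```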
+
+## 4. Optimizing AlltoAll for Sequence-Parallel Training
+
+For this part of the optimization, we add some fusion to the communication required by DeepSpeed-Ulysses, to provide a more scalable approach when increasing SP from 2 to 8. (For this study, we consider A100-DGX hardware, which has 8 GPUs per node; increasing the parallelism beyond 8 incurs a performance hit from cross-node communication.)
+
+These fusions are done at two levels:
+1. Fuse the sequence AlltoAll for q, k, and v: we scatter the heads using the mixed tensor rather than splitting them beforehand. For this part, we need some extra information from the modeling side (such as the number of q and kv heads) to split the heads before calling AlltoAll. We have added changes to the Megatron-DeepSpeed repo that incorporate this for sequence parallelism.
+2. Fuse the AlltoAll tensors and call PyTorch's all-to-all-single API: we reshape the tensors along the scatter dimension and use a single tensor for the AlltoAll, which avoids the overhead of a list of tensors that requires a `contiguous` call for each element of the list (see the sketch below).
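+
+A minimal sketch of the fused pattern (illustrative only; the actual DeepSpeed-Ulysses code also fuses q, k, and v and handles differing q/kv head counts):
+
+```python
+import torch
+import torch.distributed as dist
+
+def ulysses_alltoall(x: torch.Tensor, sp_group=None) -> torch.Tensor:
+    """Gather the sequence dimension and scatter the head dimension with a
+    single all_to_all_single call. x: [seq/P, batch, heads, head_dim]."""
+    P = dist.get_world_size(sp_group)
+    s, b, h, d = x.shape
+    # bring the scatter dimension (heads, split into P chunks) to the front
+    x = x.reshape(s, b, P, h // P, d).permute(2, 0, 1, 3, 4).contiguous()
+    out = torch.empty_like(x)
+    dist.all_to_all_single(out, x, group=sp_group)  # one fused collective
+    # out is [P, seq/P, batch, heads/P, head_dim]: full sequence, local heads
+    return out.reshape(P * s, b, h // P, d)
+```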
+
+By adding these optimizations, we see about a 10 to 15% speedup compared to the previous design and obtain good scalability across different SP degrees and context lengths. In the following table, we show the improvement achieved by using SP when doubling the GPU count and increasing the SP degree. We obtain over 80% efficiency when increasing from 256 to 512 GPUs using SP-2. Furthermore, by increasing the sequence length and SP while keeping the number of processed tokens similar, we achieve over 75% efficiency with 2x more resources. On the other hand, if we double the number of tokens (shown in the last row of Table 2), we can improve the speedup to 1.81x.
+
+
+
+| GPUs | bsz | seq | Tokens (M) | SP | Samples (4K) per second | Speedup (x) |
+|----------|:------:|:------:|:------:|:------:|:------:|:------:|
+| 256 | 256 | 8192 | 2 | 1 | 60.71 | 1 |
+| 512 | 256 | 8192 | 2 | 2 | 111.18 | 1.83 |
+| 512 | 128 | 16384 | 2 | 4 | 108.81 | 1.79 |
+| 512 | 64 | 32768 | 2 | 8 | 106.54 | 1.75 |
+| 512 | 64 | 65536 | 4 | 8 | 110.05 | 1.81 |
+
+Table 2. Sequence-Parallelism scalability using DeepSpeed-Ulysses.
+
+
diff --git a/blogs/comm-opt/assets/images/e+d.png b/blogs/comm-opt/assets/images/e+d.png
new file mode 100644
index 000000000000..72ad0f583857
Binary files /dev/null and b/blogs/comm-opt/assets/images/e+d.png differ
diff --git a/blogs/comm-opt/assets/images/sp+fp.png b/blogs/comm-opt/assets/images/sp+fp.png
new file mode 100644
index 000000000000..0b2940418f7a
Binary files /dev/null and b/blogs/comm-opt/assets/images/sp+fp.png differ
diff --git a/blogs/comm-opt/assets/images/sp-conv.png b/blogs/comm-opt/assets/images/sp-conv.png
new file mode 100644
index 000000000000..e1e36b4436a0
Binary files /dev/null and b/blogs/comm-opt/assets/images/sp-conv.png differ
diff --git a/blogs/deepspeed-chat/README.md b/blogs/deepspeed-chat/README.md
new file mode 100644
index 000000000000..43501652b98e
--- /dev/null
+++ b/blogs/deepspeed-chat/README.md
@@ -0,0 +1,332 @@
+
+
+# DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed Chat, please cite our [arxiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+ title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+ author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+ journal={arXiv preprint arXiv:2308.01320},
+ year={2023}
+}
+```
+
+# 1. Overview
+
+ChatGPT-like models have taken the AI world by storm, and it would not be an overstatement to say that their impact on the digital world has been revolutionary. These models are incredibly versatile, capable of performing tasks like summarization, coding, and translation with results on par with or even exceeding the capabilities of human experts. Given the sheer power of these models, multiple efforts are underway in the AI open-source community to make ChatGPT-style models more accessible (e.g., ChatLLaMa, Alpaca, Vicuna, Databricks-Dolly, etc.).
+
+Despite these incredible efforts, there is still a lack of an end-to-end RLHF pipeline capable of training powerful ChatGPT-like models that is easily accessible to the AI community. For instance, training a modest 6.7B ChatGPT model with existing systems typically requires an expensive multi-GPU setup that is beyond the reach of many data scientists. Even with access to such computing resources, training efficiency is often less than 5% of what these machines are capable of (as illustrated [shortly](#effective-throughput-and-scalability-analysis)). Finally, existing solutions simply cannot support easy, fast and affordable training of state-of-the-art ChatGPT models with hundreds of billions of parameters, even given access to multi-GPU clusters.
+
+These limitations stem from the lack of a robust system design capable of effectively supporting the complex RLHF training pipeline of InstructGPT, which is quite different from the standard pre-training and fine-tuning pipelines that existing DL systems are designed for. Therefore, in the spirit of democratizing ChatGPT-like models and making RLHF training truly accessible to the AI community, today we are releasing DeepSpeed-Chat with the following three capabilities:
+
+(i) ***Easy-to-use Training and Inference Experience for ChatGPT-like Models***: A single script capable of taking a pre-trained Huggingface model, running it through all three steps of InstructGPT training using the DeepSpeed-RLHF system, and producing your very own ChatGPT-like model. In addition, we provide an inference API for testing conversation-style interactions after the model is trained.
+
+(ii) ***DeepSpeed-RLHF Pipeline***: The DeepSpeed-RLHF pipeline primarily replicates the training pipeline from the InstructGPT paper with careful attention to completeness and one-to-one correspondence with its three steps: a) Supervised Fine-tuning (SFT), b) Reward Model Fine-tuning, and c) Reinforcement Learning with Human Feedback (RLHF). Additionally, we offer data abstraction and blending capabilities to enable training with multiple data sources.
+
+(iii) ***DeepSpeed-RLHF System***: A robust and sophisticated RLHF system that combines the training and inference prowess of DeepSpeed into a single unified Hybrid Engine (DeepSpeed-HE) for RLHF. The Hybrid-Engine is capable of seamlessly transitioning between inference and training modes within RLHF, allowing it to leverage various optimizations from DeepSpeed-Inference such as tensor-parallelism and high-performance transformer kernels for generation, while also benefiting from the multitude of ZeRO- and LoRA-based memory optimization strategies for RL training. DeepSpeed-HE is also aware of the full RLHF pipeline, allowing it to make optimal decisions in terms of memory management and data movement across different phases of RLHF.
+
+
+The DeepSpeed-RLHF system delivers unparalleled efficiency at scale, making complex RLHF training fast, affordable, and easily accessible to the AI community:
+
+***Efficiency and Affordability***: In terms of efficiency, [DeepSpeed-HE is over 15x faster than existing systems](#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), making RLHF training both fast and affordable. For instance, DeepSpeed-HE can train an OPT-13B in just 9 hours and OPT-30B in 18 hours on Azure Cloud for under $300 and $600, respectively.
+
+
+
+
+| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B |
+|-------------|:--------:|:--------------:|:-------------:|:-----------:|
+| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA |
+| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) |
+
+*Table 1. Single-Node 8x A100: Training Time and Corresponding Approximate Cost on Azure.*
+
+
+
+
+***Excellent Scalability***: DeepSpeed-HE supports models with hundreds of billions of parameters and can achieve excellent scalability on multi-node multi-GPU systems. As a result, even a 13B model can be trained in 1.25 hours and a massive 175B model can be trained with DeepSpeed-HE in under a day.
+
+
+
+
+| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-175B |
+|---------------|:-----------------:|:---------------:|:-------------:|:-------------:|
+| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120)|
+
+*Table 2. Multi-Node 64x A100-80GB: Training Time and Corresponding Approximate Cost on Azure.*
+
+
+> ***Very Important Details***: The numbers in both tables above are for Step 3 of the training and are based on actual measured training throughput on DeepSpeed-RLHF curated dataset and training recipe which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs). We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for more details.
+
+
+***Democratizing RLHF Training***: With just a single GPU, DeepSpeed-HE supports training models with over 13 billion parameters, enabling data scientists without access to multi-GPU systems to create not just toy RLHF models but large and powerful ones that can be used in real-world scenarios.
+
+
+
+| | V100 32G | A6000 48G | A100 40G | A100 80G |
+|------------|:---------:|:----------:|:--------:|:---------:|
+| Model Size | OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B |
+
+Table 3. Max Model Size Supported by DeepSpeed-HE on a Single GPU
+
+
+
+Next, we dive deeper into the three capabilities of DeepSpeed-Chat introduced above.
+
+# 2. Easy-to-use ChatGPT Training and Inference Experience
+
+We start with the easy-to-use experience by showing how you can train OPT-13B and then OPT-66B models with DeepSpeed-RLHF system. If you are short on time, you can even train an OPT-1.3B model on a single consumer-grade GPU in just two hours. We also demonstrate how you can use our DeepSpeed-chat RLHF API to develop your own custom pipelines.
+
+## Training your first ChatGPT-Style model is so easy with DeepSpeed-Chat’s RLHF examples
+
+*a) A single script completes all three stages of RLHF training and generates your first ChatGPT model!*
+
+We use an example of pretrained OPT-13B as the actor model and OPT-350M as the reward model in the following single script to generate a final 13B ChatGPT-style model:
+
+```bash
+pip install "deepspeed>=0.9.0"
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-Chat/
+pip install -r requirements.txt
+
+python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
+```
+
+In about half a day, your 13B model will be fully trained, with its checkpoints ready. The following table shows a breakdown of the training time for each of the three steps:
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:------:|:------:|:------:|:------:|
+| Actor: OPT-13B, Reward: OPT-350M | 2.5hr | 0.25hr | 10.8hr | 13.6hr |
+
+Table 4. E2E time breakdown for training a 13 billion parameter ChatGPT model via DeepSpeed-Chat on a single DGX node with 8 NVIDIA A100-40G GPUs.
+
+
+
+*b) Test your final model with DeepSpeed-Chat’s Inference API*
+
+Now it is time to plug in and test your final model trained by DeepSpeed-Chat! You can play with not only Q&A sessions but also conversation-style interactions.
+
+```
+Human: Do you know Microsoft?
+Assistant: Yes, I’m a big fan of Microsoft. What about them?
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
+Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which
+ is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the
+ programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
+ use the computer.
+```
+
+## Want to try different model sizes and configurations? You got it!
+We understand users often like to try different model sizes and configurations to meet their training time, resource, and quality requirements. With DeepSpeed-Chat, users can easily do that. For example, if you want to train a larger, higher-quality model on your GPU cluster for your research or business, you can simply use the same script with your desired model size (e.g., 66B) and GPU count (e.g., 64 GPUs):
+
+```bash
+python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
+```
+
+Within 9 hours, you can have your 66-billion-parameter ChatGPT model ready to be served in your favorite front-end GUI:
+
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|---------------------------------- |:-------:|:------:|:------:|:------:|
+| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5hr | 9hr |
+
+Table 5. E2E time breakdown for training a 66 billion parameter ChatGPT model via DeepSpeed-Chat on 8 DGX nodes with 8 NVIDIA A100-80G GPUs/node.
+
+
+
+If you only have around 1-2 hours for a coffee or lunch break, you can also try to train a small/toy model with DeepSpeed-Chat. For example, we prepared a training example for a 1.3B model with a single dataset to test our framework on consumer-grade GPUs. The best part is that you will have your model checkpoint ready to play with when you are back from your lunch break!
+
+```bash
+python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
+```
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:---------:|:--------:|:------:|:------:|
+| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2hr | 2.2hr |
+
+*Table 6. E2E time breakdown for training a 1.3 billion parameter ChatGPT model via DeepSpeed-Chat on a single commodity NVIDIA A6000 GPU with 48GB memory.*
+
+
+
+## Customizing your own RLHF training pipeline using DeepSpeed-Chat’s RLHF APIs
+DeepSpeed-Chat allows users to build their very own RLHF training pipeline using our flexible APIs, shown below, with which users can construct their own RLHF training strategy. This enables a general interface and backend for creating a wide range of RLHF algorithms for research exploration.
+
+```python
+
+engine = DeepSpeedRLHFEngine(
+ actor_model_name_or_path=args.actor_model_name_or_path,
+ critic_model_name_or_path=args.critic_model_name_or_path,
+ tokenizer=tokenizer,
+ num_total_iters=num_total_iters,
+ args=args)
+
+trainer = DeepSpeedPPOTrainer(engine=engine, args=args)
+
+for prompt_batch in prompt_train_dataloader:
+ out = trainer.generate_experience(prompt_batch)
+ actor_loss, critic_loss = trainer.train_rlhf(out)
+```
+
+# 3. Full-fledged RLHF Training Pipeline
+
+To provide a seamless training experience, we follow InstructGPT and include a full-fledged end-to-end training pipeline in DeepSpeed-Chat as shown in Figure 1.
+
+
+
+
+
+*Figure 1: The illustration of DeepSpeed-Chat’s RLHF training pipeline with optional features.*
+
+
+
+Our pipeline includes three main steps:
+
+* **Step 1: Supervised finetuning (SFT)**, where human responses to various queries are carefully selected to finetune the pretrained language models.
+* **Step 2: Reward model finetuning**, where a separate (usually smaller than the SFT model) reward model (RW) is trained on a dataset that contains human-provided rankings of multiple answers to the same query.
+* **Step 3: RLHF training**, where the SFT model is further finetuned with the reward feedback from the RW model using the Proximal Policy Optimization (PPO) algorithm.
+
+We provide two additional features in Step 3 to help improve model quality:
+
+* **Exponential Moving Average (EMA) collection**, where an EMA-based checkpoint can be chosen for the final evaluation.
+* **Mixture Training**, which mixes the pretraining objective (i.e., next-word prediction) with the PPO objective to prevent performance regression on public benchmarks like SQuAD2.0.
+
+The two training features, EMA and Mixture Training, are often omitted by other recent efforts since they are optional. However, according to InstructGPT, EMA checkpoints generally provide better response quality than the conventional final trained model, and Mixture Training can help the model retain its pre-training benchmark-solving ability. As such, we provide them so that users can fully reproduce the training experience described in InstructGPT and strive for higher model quality.
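+
+For reference, the EMA update is a simple per-step blend of the live weights into a shadow copy (a minimal sketch; the decay value here is illustrative):
+
+```python
+import torch
+
+@torch.no_grad()
+def update_ema(ema_model, model, beta: float = 0.999):
+    # ema <- beta * ema + (1 - beta) * param, applied after each update step
+    for ema_p, p in zip(ema_model.parameters(), model.parameters()):
+        ema_p.mul_(beta).add_(p, alpha=1.0 - beta)
+```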
+
+In addition to being highly consistent with the InstructGPT paper, we also provide convenient features to support researchers and practitioners in training their own RLHF models with multiple data sources:
+
+* ***Data Abstraction and Blending Capabilities:*** DeepSpeed-Chat is able to train the model with multiple datasets for better model quality. It is equipped with (1) an abstract dataset layer to unify the format of different datasets; and (2) data splitting/blending capabilities so that the multiple datasets are properly blended and then split across the 3 training stages.
+
+To illustrate the effectiveness of our training pipeline, we demonstrate the model quality with multi-round conversation as shown in the experience section.
+
+
+# 4. DeepSpeed Hybrid Engine – Unified Infrastructure to Power and Optimize RLHF Training
+
+Step 1 and Step 2 of the instruct-guided RLHF pipeline resemble regular fine-tuning of large models, and they are powered by ZeRO-based optimizations and a flexible combination of parallelism strategies in DeepSpeed training to achieve scale and speed. Step 3 of the pipeline, on the other hand, is the most complex part to handle in terms of performance implications. Each iteration requires efficient processing of two phases: a) the inference phase for token/experience generation, producing inputs for the training, and b) the training phase to update the weights of the actor and reward models, as well as the interaction and scheduling between them. This introduces two major costs: (1) the memory cost, as several copies of the SFT and RW models need to be served throughout stage 3; and (2) the predominant generation phase, which, if not accelerated properly, will significantly slow down the entire stage 3. Additionally, the two important features we added in Stage 3, Exponential Moving Average (EMA) collection and Mixture Training, incur additional memory and training costs.
+
+To tackle these challenges, we composed the full system capability of DeepSpeed Training and Inference into a unified infrastructure that we call the **Hybrid Engine**. It leverages the original DeepSpeed engines for the fast training mode while effortlessly applying the DeepSpeed inference engine for the generation/evaluation mode, providing a significantly faster training system for RLHF at Stage 3. As Figure 2 shows, the transition between the DeepSpeed training and inference engines is seamless: by switching the actor model between the typical eval and train modes, DeepSpeed applies different optimizations for the inference and training pipelines, running the model faster and improving overall system throughput.
+
+
+
+
+
+
+*Figure 2. DeepSpeed Hybrid Engine design for accelerating the most time-consuming portion of an RLHF pipeline.*
+
+
+
+During its inference execution for the experience generation phase of RLHF training, DeepSpeed Hybrid Engine uses a light-weight memory management system to handle the KV-cache and intermediate results, together with highly optimized inference-adapted kernels and tensor parallelism implementation, to achieve a significant boost in throughput (tokens-per-second) compared to the existing solutions.
+
+During the training execution, Hybrid Engine enables memory optimization techniques such as DeepSpeed's ZeRO family of technologies and Low-Rank Adaptation (LoRA). We designed and implemented these system optimizations in a way that they are compatible with each other and can be composed together to deliver the highest training efficiency under the unified Hybrid Engine.
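+
+Putting the two modes together, one Stage-3 iteration conceptually proceeds as in the simplified sketch below (illustrative pseudocode, not the actual Hybrid Engine API; `prompt_dataloader`, `reward_model`, and `ppo_loss` are placeholder names):
+
+```python
+import torch
+
+for prompt_batch in prompt_dataloader:
+    actor.eval()                      # inference mode: fused kernels, TP, KV-cache
+    with torch.no_grad():
+        seq = actor.generate(prompt_batch, max_new_tokens=256)
+    reward = reward_model(seq)
+
+    actor.train()                     # training mode: ZeRO sharding, LoRA
+    loss = ppo_loss(actor, seq, reward)
+    actor.backward(loss)              # DeepSpeed engine train-mode API
+    actor.step()
+```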
+
+Hybrid Engine can seamlessly change model partitioning across training and inference to support tensor-parallelism based inferencing and ZeRO-based sharding mechanisms for training. It can also reconfigure the memory system to maximize memory availability during each of these modes. This allows for improved performance by avoiding memory allocation bottlenecks and supporting large batch sizes. Packed with a spectrum of system technologies from DeepSpeed training and inference, Hybrid Engine pushes the boundary of modern RLHF training and delivers unparalleled scale and system efficiency for RLHF workloads.
+
+
+# 5. DeepSpeed RLHF: Unparalleled Scale and Efficiency via Hybrid Engine
+
+## Capability Recap
+
+As discussed, DeepSpeed-HE is an amalgamation of powerful system technologies for inference and training, architected to achieve excellent scale and efficiency for the DeepSpeed-RLHF pipeline across a wide range of hardware, making RLHF training fast, affordable, and easily accessible to the AI community.
+
+In terms of efficiency and affordability, as shown in Table 1, DeepSpeed-HE can train OPT-13B in just 9 hours and OPT-30B in 18 hours on Azure Cloud for under $300 and $600, respectively. In terms of speed and scalability, as shown in Table 2, even a 13B model can be trained in 1.25 hours and a massive 175B model can be trained in under a day using a 64 GPU cluster. And in terms of accessibility and democratization of RLHF, DeepSpeed-HE supports training models with over 13 billion parameters on a single GPU as shown in Table 3.
+
+## Throughput and Model Size Scalability Comparisons with Existing RLHF Systems
+
+Compared to other RLHF systems like Colossal-AI or HuggingFace powered by native PyTorch, DeepSpeed-RLHF excels in system performance and model scalability:
+
+* With respect to throughput, DeepSpeed enables over 10x improvement for RLHF training on a single GPU (Figure 3). On a multi-GPU setup, it enables a 6 - 19x speedup over Colossal-AI and a 1.4 - 10.5x speedup over HuggingFace DDP (Figure 4).
+* With respect to model scalability, while Colossal-AI can run a max model size of 1.3B on a single GPU and 6.7B on a single A100 40G node, DeepSpeed-HE can run 6.5B and 50B models respectively on the same hardware, up to 7.5x larger.
+
+Therefore, with over an order of magnitude higher throughput, DeepSpeed-HE unlocks the ability to train significantly larger actor models under the same latency budget or train models of similar size at over 10x lower cost, compared to existing RLHF systems like Colossal-AI or HuggingFace DDP.
+
+
+
+
+
+*Figure 3. Step 3 throughput comparison against two other system frameworks for accelerating RLHF \
+training on a single NVIDIA A100-40G commodity GPU. No icons represent OOM scenarios.*
+
+
+
+
+
+
+
+*Figure 4. End-to-end training throughput comparison for step 3 of the training pipeline (the most time \
+consuming portion) with different model sizes on a single DGX node equipped with 8 NVIDIA A100-40G GPUs.\
+No icons represent OOM scenarios.*
+
+
+
+This improvement in efficiency stems from DeepSpeed-HE's ability to accelerate the generation phase of RLHF processing by leveraging DeepSpeed inference optimizations. Figure 5 shows the time breakdown for a 1.3B parameter model over an RLHF training iteration: the majority of the time goes to the generation phase. By leveraging high-performance inference kernels from DeepSpeed, DeepSpeed-HE can achieve up to a 9x throughput improvement during this phase over HuggingFace and 15x over Colossal-AI, allowing it to achieve unparalleled end-to-end efficiency.
+
+
+
+
+
+*Figure 5. Superior generation phase acceleration from DeepSpeed Chat’s Hybrid Engine: A time/sequence breakdown for training OPT-1.3B actor model + OPT-350M reward model on a single DGX node with 8 A100-40G GPUs.*
+
+
+
+## Effective Throughput and Scalability Analysis
+
+***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE during Stage 3 of RLHF training depends on the throughput it achieves during the generation and RL training phases. In our RLHF pipeline, the generation phase comprises approximately 20% of the total computation, while the RL training phase comprises the remaining 80% (see the [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for details). However, despite its small proportion, the former can take a large portion of the e2e time, as it requires running the actor model once for each of the 256 generated tokens with an initial prompt of 256 tokens, making it memory-bandwidth bound and difficult to achieve high throughput for. In contrast, the RL training phase is compute bound: it runs the reference actor model with just a couple of forward and backward passes over the full 512 tokens from both prompt and generation per sample, and can achieve good throughput.
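+
+A back-of-the-envelope estimate (illustrative numbers, not measured) shows why single-batch generation is bandwidth bound: every generated token must stream essentially all model weights through HBM at least once.
+
+```python
+params = 13e9                  # e.g., OPT-13B
+bytes_per_param = 2            # fp16
+hbm_bw = 1.6e12                # ~1.6 TB/s on an A100-40G
+t_token = params * bytes_per_param / hbm_bw     # ~16 ms floor per token
+tflops = 2 * params / t_token / 1e12            # ~1.6 TFlops/GPU at batch 1
+# vs. ~312 TFlops fp16 peak: a tiny fraction of peak unless large batches
+# amortize the weight reads, which is exactly what DeepSpeed-HE targets.
+```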
+
+
+
+
+
+*Figure 6. RLHF Generation, training, and effective throughput with DeepSpeed-HE for different model sizes, at the GPU count that maximizes efficiency.*
+
+
+
+To maximize the effective throughput, DeepSpeed-HE optimizes both phases. First, it uses the largest batch size possible to get higher efficiency in both phases. Second, during the generation phase, it leverages high-performance transformer kernels to maximize GPU memory bandwidth utilization when the model fits in a single GPU's memory, and leverages tensor-parallelism (TP) when it does not. Using TP in the generation phase instead of ZeRO to fit the model reduces inter-GPU communication and maintains high GPU memory bandwidth utilization.
+
+Figure 6 shows the best achievable effective throughput of DeepSpeed-HE in TFlops/GPU for model sizes ranging from 1.3B to 175B, along with the throughput achieved by each of the generation and training phases. DeepSpeed-HE is most efficient for models in the 6.7B-66B range. Beyond this range, at 175B, the throughput drops due to the limited memory available for larger batch sizes, while still achieving 1.2x better efficiency than the small 1.3B model. The per-GPU throughput of these gigantic models could improve further when we scale them to more GPUs, with more memory available for larger batch sizes.
+
+Furthermore, we would like to point out that our effective performance is 19x higher than existing systems, as shown in Figure 4, which suggests that they are operating at lower than 5% of the peak. This demonstrates the challenge of optimizing RLHF workloads as well as the effectiveness of our system despite the challenge.
+
+
+
+
+
+*Figure 7. Scalability for training 13B (left) and 66B (right) actor model+350M reward model on an increasing number of DGX nodes with 8 A100-40/80G GPUs*
+
+
+
+***(II) Scalability Analysis.*** The best effective throughput for different model sizes is achieved at different GPU counts. This is partly because some of the larger model sizes require more memory to run. However, a large part of this behavior stems from DeepSpeed-HE's scalability properties, which we discuss next.
+
+Figure 7 shows that DeepSpeed-RLHF achieves good scaling overall on up to 64 GPUs. However, looking more closely, DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near-linear or sub-linear scaling at larger scales. This is due to the interaction between memory availability and the max global batch size.
+
+As DeepSpeed-HE is powered by ZeRO-based technology for training, it allows model states to be partitioned across the available GPUs. As a result, the memory consumption per GPU decreases as the number of GPUs increases, allowing DeepSpeed-HE to support a larger batch per GPU and resulting in super-linear scaling. At larger scale, however, while the available memory continues to increase, the maximum global batch size (1024 in our case, with a sequence length of 512) limits the batch size per GPU, resulting in near-linear or sub-linear scaling.
+As a result, for a given max global batch size, DeepSpeed-HE achieves the best throughput and cost efficiency at the boundary of super-linear and sub-linear scalability, and the exact point is mostly determined by the largest batch size that can be run per GPU, as a function of the available memory and the global batch size.
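+
+The interplay can be captured with a tiny illustrative model (all constants here are made up except the 1024 global batch cap; real per-GPU capacity grows with the memory freed by ZeRO rather than exactly linearly):
+
+```python
+def per_gpu_batch(n_gpu, global_cap=1024, mem_base=4):
+    memory_limited = mem_base * n_gpu    # ZeRO frees memory as GPUs grow
+    cap_limited = global_cap // n_gpu    # fixed global batch / #GPUs
+    return min(memory_limited, cap_limited)
+
+for n in (8, 16, 32, 64):
+    print(n, per_gpu_batch(n))
+# 8 -> 32 and 16 -> 64: memory bound, per-GPU batch grows (super-linear)
+# 32 -> 32 and 64 -> 16: cap bound, per-GPU batch shrinks (sub-linear)
+```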
+
+# 6. Release: Try DeepSpeed Chat Today!
+
+We are very excited to share that DeepSpeed-Chat is now open-sourced and available to the AI community.
+
+* To get started, please visit our github page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)
+
+* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for the future.
+
+DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
diff --git a/blogs/deepspeed-chat/assets/images/ds_chat_main.png b/blogs/deepspeed-chat/assets/images/ds_chat_main.png
new file mode 100644
index 000000000000..3266a425b102
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/ds_chat_main.png differ
diff --git a/blogs/deepspeed-chat/assets/images/ds_chat_stability_sweep.png b/blogs/deepspeed-chat/assets/images/ds_chat_stability_sweep.png
new file mode 100644
index 000000000000..d98cd765bc6a
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/ds_chat_stability_sweep.png differ
diff --git a/blogs/deepspeed-chat/assets/images/ds_chat_zero_offload_gpu.png b/blogs/deepspeed-chat/assets/images/ds_chat_zero_offload_gpu.png
new file mode 100644
index 000000000000..935cadc0cf13
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/ds_chat_zero_offload_gpu.png differ
diff --git a/blogs/deepspeed-chat/assets/images/dschat-llama-13b-HE-perf.png b/blogs/deepspeed-chat/assets/images/dschat-llama-13b-HE-perf.png
new file mode 100644
index 000000000000..56cf6280d8a5
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/dschat-llama-13b-HE-perf.png differ
diff --git a/blogs/deepspeed-chat/assets/images/dschat-llama-7b-HE-perf.png b/blogs/deepspeed-chat/assets/images/dschat-llama-7b-HE-perf.png
new file mode 100644
index 000000000000..93342fffbc60
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/dschat-llama-7b-HE-perf.png differ
diff --git a/blogs/deepspeed-chat/assets/images/dschat-mpzero-llama.png b/blogs/deepspeed-chat/assets/images/dschat-mpzero-llama.png
new file mode 100644
index 000000000000..ae7f5f62f52e
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/dschat-mpzero-llama.png differ
diff --git a/blogs/deepspeed-chat/assets/images/llama2-with-he.png b/blogs/deepspeed-chat/assets/images/llama2-with-he.png
new file mode 100644
index 000000000000..de5015d277bf
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/llama2-with-he.png differ
diff --git a/blogs/deepspeed-chat/assets/images/sweep_after_lora_fix.png b/blogs/deepspeed-chat/assets/images/sweep_after_lora_fix.png
new file mode 100644
index 000000000000..d12dca7ac49c
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/sweep_after_lora_fix.png differ
diff --git a/blogs/deepspeed-chat/assets/images/sweep_before_lora_fix.png b/blogs/deepspeed-chat/assets/images/sweep_before_lora_fix.png
new file mode 100644
index 000000000000..ab01a46cddf7
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/sweep_before_lora_fix.png differ
diff --git a/blogs/deepspeed-chat/assets/images/zero_offload_after_stability.png b/blogs/deepspeed-chat/assets/images/zero_offload_after_stability.png
new file mode 100644
index 000000000000..8392f8230ff7
Binary files /dev/null and b/blogs/deepspeed-chat/assets/images/zero_offload_after_stability.png differ
diff --git a/blogs/deepspeed-chat/chinese/README.md b/blogs/deepspeed-chat/chinese/README.md
new file mode 100644
index 000000000000..03bf9b69449f
--- /dev/null
+++ b/blogs/deepspeed-chat/chinese/README.md
@@ -0,0 +1,325 @@
+
+
+# DeepSpeed Chat: 一键式RLHF训练,让你的类ChatGPT千亿大模型提速省钱15倍
+
+
+
+
+
+
+
+
+
+如需引用 DeepSpeed Chat,请引用我们的[arxiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+ title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+ author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+ journal={arXiv preprint arXiv:2308.01320},
+ year={2023}
+}
+```
+
+# 1. 概述
+
+近日来,ChatGPT及类似模型引发了人工智能(AI)领域的一场风潮。 这场风潮对数字世界产生了革命性影响。ChatGPT类模型具有惊人的泛用性,能够执行归纳、编程、翻译等任务,其结果与人类专家相当甚至更优。为了使ChatGPT等模型的训练和部署更轻松,AI 开源社区进行了各种尝试(例如 ChatLLaMa、Alpaca、Vicuna、Databricks-Dolly等)。
+
+然而,尽管开源社区付出了巨大的努力,目前仍缺乏一个支持端到端的基于人工反馈机制的强化学习(RLHF)的规模化系统,这使得训练强大的类ChatGPT模型十分困难。例如,使用现有的开源系统训练一个具有 67 亿参数的类ChatGPT模型通常需要昂贵的多卡至多节点的 GPU 集群,但这些资源对大多数数据科学家或研究者而言难以获取。同时,即使有了这样的计算资源,[现有的开源系统的训练效率通常还不到这些机器所能达到的最大效率的5%](#有效吞吐量和可扩展性分析)。简而言之,即使有了昂贵的多GPU集群,现有解决方案也无法轻松、快速、经济的训练具有数千亿参数的最先进的类ChatGPT模型。
+
+ChatGPT模型的训练是基于InstructGPT论文中的RLHF方式。这与常见的大语言模型的预训练和微调截然不同。这使得现有深度学习系统在训练类ChatGPT模型时存在种种局限。因此,为了让ChatGPT类型的模型更容易被普通数据科学家和研究者使用,并使RLHF训练真正普及到AI社区,我们发布了 DeepSpeed-Chat。DeepSpeed-Chat具有以下三大核心功能:
+
+
+(i)***简化 ChatGPT 类型模型的训练和强化推理体验***:只需一个脚本即可实现多个训练步骤,包括使用 Huggingface 预训练的模型、使用 DeepSpeed-RLHF 系统运行 InstructGPT 训练的所有三个步骤、甚至生成你自己的类ChatGPT模型。此外,我们还提供了一个易于使用的推理API,用于用户在模型训练后测试对话式交互。
+
+(ii)***DeepSpeed-RLHF 模块***:DeepSpeed-RLHF 复刻了 InstructGPT 论文中的训练模式,并确保包括a) 监督微调(SFT),b) 奖励模型微调和 c) 基于人类反馈的强化学习(RLHF)在内的三个步骤与其一一对应。此外,我们还提供了数据抽象和混合功能,以支持用户使用多个不同来源的数据源进行训练。
+
+(iii)***DeepSpeed-RLHF 系统***:我们将 DeepSpeed 的训练(training engine)和推理能力(inference engine) 整合到一个统一的混合引擎(DeepSpeed Hybrid Engine or DeepSpeed-HE)中用于 RLHF 训练。DeepSpeed-HE 能够在 RLHF 中无缝地在推理和训练模式之间切换,使其能够利用来自 DeepSpeed-Inference 的各种优化,如张量并行计算和高性能CUDA算子进行语言生成,同时对训练部分还能从 ZeRO- 和 LoRA-based 内存优化策略中受益。DeepSpeed-HE 还能够自动在 RLHF 的不同阶段进行智能的内存管理和数据缓存。
+
+
+DeepSpeed-RLHF 系统在大规模训练中具有无与伦比的效率,使复杂的 RLHF 训练变得快速、经济并且易于大规模推广:
+
+**高效性和经济性**:[DeepSpeed-HE 比现有系统快 15 倍以上](#与现有-RLHF-系统的吞吐量和模型大小可扩展性比较),使 RLHF 训练快速且经济实惠。例如,DeepSpeed-HE 在 Azure 云上只需 9 小时即可训练一个 OPT-13B模型,只需 18 小时即可训练一个 OPT-30B模型。这两种训练分别花费不到 300 美元和 600 美元。
+
+
+
+| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B |
+|-------------|:--------:|:--------------:|:-------------:|:-----------:|
+| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA |
+| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) |
+
+*表 1. 单节点 8x A100:训练时长及预估的 Azure 费用。*
+
+
+
+***卓越的扩展性***:DeepSpeed-HE 能够支持训练拥有数千亿参数的模型,并在多节点多 GPU 系统上展现出卓越的扩展性。因此,即使是一个拥有 130 亿参数的模型,也只需 1.25 小时就能完成训练。而对于庞大的拥有 1750 亿参数的模型,使用 DeepSpeed-HE 进行训练也只需不到一天的时间。
+
+
+
+| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-175B |
+|---------------|:-----------------:|:---------------:|:-------------:|:-------------:|
+| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120)|
+
+*表 2. 多节点 64x A100-80GB:训练时长及预估的 Azure 费用。*
+
+
+> ***非常重要的细节***: 上述两个表格(即表一和表二)中的数据均针对 RLHF 训练的第 3 步,基于实际数据集和 DeepSpeed-RLHF 训练吞吐量的测试。该训练在总共 1.35 亿(135M)个字符(token)上进行一个时期(epoch)的训练。我们总共有 6750 万个查询(query)字符(131.9k 个 query,每个序列长度为 256)和 6750 万个生成/回答字符(131.9k 个答案,每个序列长度为 256),每步的最大全局字符批量大小约为 500 万个字符(1024 个查询-答案对)。在与 DeepSpeed-RLHF 进行任何成本和端到端时间比较之前,我们建议读者注意这些设定。想要了解更多详细信息,请参阅我们的页面 [benchmark setting](https://github.com/microsoft/DeepSpeedExamples-internal/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)。
+
+***实现 RLHF 训练的普及化***:仅凭单个 GPU,DeepSpeed-HE 就能支持训练超过 130 亿参数的模型。这使得那些无法使用多 GPU 系统的数据科学家和研究者不仅能够轻松创建轻量级的 RLHF 模型,还能创建大型且功能强大的模型,以应对不同的使用场景。
+
+
+
+| | V100 32G | A6000 48G | A100 40G | A100 80G |
+|------------|:---------:|:----------:|:--------:|:---------:|
+| Model Size | OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B |
+
+*表 3. DeepSpeed-HE 在不同的 GPU 单卡上支持的最大模型。*
+
+
+
+接下来,我们将深入了解上面介绍的 DeepSpeed-Chat 的三个功能。
+
+# 2. 简洁高效且经济的 ChatGPT 训练与推理体验
+
+接下来,我们向你展示如何使用一行命令即可利用 DeepSpeed-RLHF 系统训练 OPT-13B 和 OPT-66B 模型。即使时间有限,你也可以在大约两小时内在单个消费级 GPU 上训练一个 OPT-1.3B 模型。此外,我们还演示了如何利用我们的 DeepSpeed-chat RLHF API 来开发你自己的模型:
+
+## 使用 DeepSpeed-Chat 的 RLHF 示例轻松训练你的第一个 类ChatGPT 模型
+
+*a) 仅需一个脚本,完成 RLHF 训练的全部三个阶段,生成你的第一个 ChatGPT 模型!*
+
+以下示例展示了如何使用一个脚本,以预训练的 OPT-13B 作为 actor 模型,OPT-350M 作为 reward 模型,生成一个最终的 130 亿参数的 ChatGPT 类型的模型:
+```bash
+pip install "deepspeed>=0.9.0"
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-Chat/
+pip install -r requirements.txt
+
+python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
+```
+
+
+大约半天时间,你拥有 130 亿参数的模型即可完成训练。下表展示了整个过程在三个步骤中所需的时间:
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:------:|:------:|:------:|:------:|
+| Actor: OPT-13B, Reward: OPT-350M | 2.5hr | 0.25hr | 10.8hr | 13.6hr |
+
+*表 4. 在单个DGX节点上(8x A100-40G),针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-13b所需的时间。*
+
+
+
+*b) 使用 DeepSpeed-Chat 的推理 API 测试训练好的模型*
+
+将训练好的模型导入提供的简易 API,你不仅可以与模型进行问答互动,还可以进行对话式交流。
+
+```
+Human: Do you know Microsoft?
+Assistant: Yes, I’m a big fan of Microsoft. What about them?
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
+Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which
+ is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the
+ programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
+ use the computer.
+```
+
+## 想尝试不同的模型大小和配置?轻松实现!
+我们了解到用户通常喜欢尝试不同的模型大小和配置,以满足他们不同的训练时间、资源和质量的需求。借助 DeepSpeed-Chat,你可以轻松实现这些目标。例如,如果你想在 GPU 集群上训练一个更大、更高质量的模型,用于你的研究或业务,你可以使用相同的脚本,只需输入你期望的模型大小(例如 660 亿参数)和 GPU 数量(例如 64 个 GPU):
+
+```bash
+python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
+```
+
+在接下来的 9 小时内,你将拥有一个 660 亿参数的 ChatGPT 模型,并可以在你喜欢的前端 GUI 中使用:
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|---------------------------------- |:-------:|:------:|:------:|:------:|
+| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5hr | 9hr |
+
+*表 5. 在8个DGX节点上(64x A100-80G),针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-66b所需的时间。*
+
+
+
+如果你只有大约 1-2 小时的咖啡或午餐休息时间,你也可以尝试使用 DeepSpeed-Chat 训练一个小型模型。例如,我们为单个数据集准备了一个 13 亿参数模型的训练示例,以便在你的消费级 GPU 上测试我们的框架。当你从午餐休息回来时,你的模型已经准备好供你使用!
+
+```bash
+python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
+```
+
+
+
+| Model Sizes | Step 1 | Step 2 | Step 3 | Total |
+|--------------------------------- |:---------:|:--------:|:------:|:------:|
+| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2hr | 2.2hr |
+
+*表 6. 在单个消费级A6000-48G上,针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-1.3b所需的时间。*
+
+
+
+## 利用 DeepSpeed-Chat 的 RLHF API 自定义你自己的 RLHF 训练流程
+DeepSpeed-Chat 允许用户使用我们灵活的 API(如下所示)构建自己的 RLHF 训练流程,用户可以使用这些 API 重建自己的 RLHF 训练策略。我们希望这些功能可以为研究探索中创建各种 RLHF 算法提供通用接口和后端。
+```python
+
+engine = DeepSpeedRLHFEngine(
+ actor_model_name_or_path=args.actor_model_name_or_path,
+ critic_model_name_or_path=args.critic_model_name_or_path,
+ tokenizer=tokenizer,
+ num_total_iters=num_total_iters,
+ args=args)
+
+trainer = DeepSpeedPPOTrainer(engine=engine, args=args)
+
+for prompt_batch in prompt_train_dataloader:
+ out = trainer.generate_experience(prompt_batch)
+ actor_loss, critic_loss = trainer.train_rlhf(out)
+```
+
+# 3. 完整的 RLHF 训练流程概述
+
+为了实现无缝的训练体验,我们遵循 InstructGPT 论文的方法,并在 DeepSpeed-Chat 中整合了一个端到端的训练流程,如图 1 所示。
+
+
+
+
+
+*图 1: DeepSpeed-Chat 的 RLHF 训练流程图示,包含了一些可选择的功能。*
+
+
+
+我们的流程包括三个主要步骤:
+
+* **步骤1:监督微调(SFT)** —— 使用精选的人类回答来微调预训练的语言模型以应对各种查询;
+* **步骤2:奖励模型微调** —— 使用一个包含人类对同一查询的多个答案打分的数据集来训练一个独立的(通常比 SFT 小的)奖励模型(RW);
+* **步骤3:RLHF 训练** —— 利用 Proximal Policy Optimization(PPO)算法,根据 RW 模型的奖励反馈进一步微调 SFT 模型。
+
+在步骤3中,我们提供了两个额外的功能,以帮助提高模型质量:
+* **指数移动平均(EMA)** —— 可以选择基于 EMA 的检查点进行最终评估
+* **混合训练** —— 将预训练目标(即下一个单词预测)与 PPO 目标混合,以防止在像 SQuAD2.0 这样的公开基准测试中的性能损失
+
+这两个训练功能,EMA 和混合训练,常常被其他的开源框架所忽略,因为它们并不会妨碍训练的进行。然而,根据 InstructGPT,EMA 通常比传统的最终训练模型提供更好的响应质量,而混合训练可以帮助模型保持预训练基准解决能力。因此,我们为用户提供这些功能,以便充分获得 InstructGPT 中描述的训练体验,并争取更高的模型质量。
+
+除了与 InstructGPT 论文高度一致外,我们还提供了一项方便的功能,以支持研究人员和从业者使用多个数据资源训练他们自己的 RLHF 模型:
+
+* **数据抽象和混合能力**: DeepSpeed-Chat 能够使用多个不同来源的数据集训练模型以获得更好的模型质量。它配备了(1)一个抽象数据集层,以统一不同数据集的格式;以及(2)数据拆分/混合功能,以便多个数据集在 3 个训练阶段中被适当地混合然后拆分。
+
+在我们之前的章节中,你可以看到使用整个 DeepSpeed-Chat 训练模型在多轮对话中的表现。
+
+# 4. DeepSpeed Hybrid Engine: A Unified, Efficient Engine that Powers and Optimizes RLHF Training
+
+The first two steps of the DeepSpeed-Chat pipeline resemble regular fine-tuning of large models, and they achieve scale and speed through ZeRO-based memory-management optimizations and flexible combinations of parallelism strategies in DeepSpeed training. Step 3 of the pipeline, however, is the most challenging part in terms of performance. Each iteration must efficiently handle two phases: a) the inference phase that generates responses, producing inputs for training, and b) the training phase that updates the weights of the actor and reward models, along with the interaction and scheduling between the two. This introduces two major difficulties: (1) the memory cost, since several copies of the SFT and RW models must be kept live throughout Step 3; and (2) the slow response-generation phase, which, if not properly accelerated, significantly slows down the whole of Step 3. In addition, the two important optional features we added in Step 3, Exponential Moving Average (EMA) collection and Mixture Training, incur additional memory and training costs.
+
+To tackle these challenges, we composed the full system capabilities of DeepSpeed Training and Inference into a unified infrastructure that we call the Hybrid Engine. It leverages the original DeepSpeed engine for the fast training mode, while effortlessly applying the DeepSpeed inference engine for the generation/evaluation mode, providing a significantly faster training system for Step 3 of RLHF training. As Figure 2 shows, the transition between the DeepSpeed training and inference engines is seamless: with the typical eval and train modes enabled for the actor model, DeepSpeed selects different optimizations when running the inference and training pipelines, to run the model faster and improve overall system throughput.
+
+
+
+
+
+*Figure 2. Design of the DeepSpeed Hybrid Engine, which accelerates the most time-consuming portion of the RLHF pipeline.*
+
+
+
+During the inference execution of the experience-generation phase of RLHF training, the DeepSpeed Hybrid Engine uses a lightweight memory-management system to handle the KV cache and intermediate results, together with highly optimized inference CUDA kernels and tensor-parallel computation. Compared with existing solutions, DeepSpeed-HE significantly improves throughput (tokens per second).
+
+During the training execution, the Hybrid Engine employs a range of memory-optimization techniques, such as DeepSpeed's ZeRO family of technologies and the now-popular LoRA method. These techniques are compatible with each other within the Hybrid Engine and can be composed to deliver the highest training efficiency.
+
+DeepSpeed-HE can seamlessly change model partitioning between training and inference, supporting tensor-parallelism-based inference and ZeRO-based sharding for training. It also reconfigures the memory system during these transitions to maximize memory availability. DeepSpeed-HE further improves performance by avoiding memory-allocation bottlenecks and by supporting large batch sizes. Integrating a spectrum of system technologies from DeepSpeed training and inference, the Hybrid Engine pushes past the limits of existing RLHF training and delivers unparalleled scale and system efficiency for RLHF workloads.
+
+# 5. DeepSpeed RLHF: Unparalleled Scale and Efficiency via the Hybrid Engine
+
+## Capability Recap
+
+As discussed above, DeepSpeed-HE is a powerful fusion of inference and training systems, designed to let DeepSpeed-RLHF achieve excellent scale and efficiency on a wide range of hardware, making RLHF training fast, affordable, and easily accessible to the AI community.
+
+In terms of efficiency and affordability, as Table 1 shows, DeepSpeed-HE can train an OPT-13B model in just 9 hours and an OPT-30B model in just 18 hours on Azure Cloud, for under $300 and $600 respectively. In terms of speed and scalability, as Table 2 shows, even a 13B model can be trained in 1.25 hours, and a massive 175B model can be trained in under a day using a 64-GPU cluster. In terms of accessibility and the democratization of RLHF, DeepSpeed-HE can train a model with over 13 billion parameters on a single GPU, as Table 3 shows.
+
+## Throughput and Model Size Scalability Comparisons with Existing RLHF Systems
+
+Compared with other RLHF systems, such as Colossal-AI or HuggingFace powered by native PyTorch, DeepSpeed-RLHF excels in system performance and model scalability:
+
+* In terms of throughput, DeepSpeed achieves over a 10x improvement for RLHF training on a single GPU (Figure 3). In multi-GPU setups, it is 6-19x faster than Colossal-AI and 1.4-10.5x faster than HuggingFace DDP (Figure 4).
+* In terms of model scalability, Colossal-AI can run a model of at most 1.3B on a single GPU and 6.7B on a single A100-40G node, whereas DeepSpeed-HE can run 6.5B and 50B models on the same hardware, an improvement of up to 7.5x.
+
+Therefore, with over an order of magnitude higher throughput than existing RLHF systems such as Colossal-AI or HuggingFace DDP, DeepSpeed-HE can train a larger actor model within the same time budget, or train a similarly sized model at a tenth of the cost.
+
+
+
+
+
+
+*Figure 3. Step 3 RLHF training throughput compared with two other system frameworks on a single NVIDIA A100-40G GPU. A missing icon indicates an out-of-memory (OOM) case.*
+
+
+
+
+
+
+
+*Figure 4. End-to-end training throughput comparison of Step 3 of the training pipeline (the most time-consuming portion) for different model sizes, on a single DGX node with 8 NVIDIA A100-40G GPUs. A missing icon indicates an out-of-memory (OOM) case.*
+
+
+
+This improvement in efficiency is the result of DeepSpeed-HE leveraging DeepSpeed's inference optimizations to accelerate RLHF generation during RLHF processing. Figure 5 shows the time breakdown of an RLHF training iteration for a 1.3B-parameter model: most of the time is spent in the generation phase. By leveraging DeepSpeed's high-performance inference kernels, DeepSpeed-HE achieves up to 9x higher throughput than HuggingFace and 15x higher than Colossal-AI in this phase, enabling unparalleled end-to-end efficiency.
+
+
+
+
+
+*Figure 5. Superior generation-phase acceleration of DeepSpeed Chat's Hybrid Engine: time/sequence breakdown for training an OPT-1.3B actor model + OPT-350M reward model on a single DGX node with 8 A100-40G GPUs.*
+
+
+
+## Effective Throughput and Scalability Analysis
+
+***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE in Stage 3 of RLHF training depends on the throughput it achieves in the generation and RL training phases. In our RLHF pipeline (see the [benchmarking setting](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) for details), the generation phase accounts for roughly 20% of the total compute, while the RL training phase accounts for the remaining 80%. Despite its small share, however, the former can take the majority of the end-to-end time, because it must run the actor model once for each generated token, making it memory-bandwidth bound and hard to drive at high throughput. The RL training phase, by contrast, is compute-intensive: it only needs a few forward and backward passes over the reference actor model, with the full 512 tokens from both the prompt and the generation for each sample, and can therefore achieve good throughput.
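+
+As a back-of-envelope illustration of why the memory-bound generation phase dominates wall-clock time (all numbers below are placeholders, not measurements):
+
+```python
+# Phase times add as work/throughput: generation's low achieved TFLOPs lets
+# 20% of the FLOPs consume most of the end-to-end time.
+gen_work, train_work = 0.2, 0.8         # fractions of total FLOPs
+gen_tflops, train_tflops = 20.0, 160.0  # illustrative per-GPU throughputs
+
+total_time = gen_work / gen_tflops + train_work / train_tflops
+effective_tflops = (gen_work + train_work) / total_time
+gen_share = (gen_work / gen_tflops) / total_time
+print(f"effective ~{effective_tflops:.0f} TFlops/GPU; generation ~{gen_share:.0%} of time")
+```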
+
+
+
+
+
+*Figure 6. RLHF generation, training, and effective throughput of DeepSpeed-HE for different model sizes, at the configuration that maximizes efficiency.*
+
+
+
+To maximize the effective throughput, DeepSpeed-HE optimizes both phases. First, it uses the largest batch size possible to achieve higher efficiency in both phases. Second, during the generation phase, it leverages high-performance CUDA kernels to maximize GPU memory-bandwidth utilization when the model fits on a single GPU, and falls back to tensor parallelism (TP) for the computation otherwise. Using TP rather than ZeRO in the generation phase further reduces inter-GPU communication and keeps GPU memory-bandwidth utilization high.
+
+Figure 6 shows the best achievable effective throughput of DeepSpeed-HE (in TFlops/GPU) for model sizes ranging from 1.3B to 175B, along with the throughput achieved in the generation and training phases separately. DeepSpeed-HE is most efficient for models in the 6.7B-66B range. Going beyond this range to 175B, the throughput drops because limited memory prevents larger batch sizes, but it is still 1.2x more efficient than for the small 1.3B model. The per-GPU throughput for these gigantic models could improve further when we scale them to more GPUs with more memory.
+
+Furthermore, we would like to point out that, as shown in Figure 4, our effective performance is 19x higher than that of existing systems, which suggests that they are operating at less than 5% of peak. This illustrates both the challenge of optimizing RLHF workloads and the effectiveness of our system in the face of that challenge.
+
+
+
+
+
+*Figure 7. Scalability of training a 13B (left) and 66B (right) actor model with a 350M reward model on an increasing number of DGX nodes (A100-40/80G GPUs).*
+
+
+
+***(II) Scalability Analysis.*** The best effective throughput for different model sizes is achieved with different numbers of GPUs, partly because some of the larger model sizes require more memory to run. With this in mind, we next discuss the scalability characteristics of DeepSpeed-HE.
+
+Figure 7 shows that DeepSpeed-RLHF achieves good overall scaling on clusters of up to 64 GPUs. A closer look, however, reveals that DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near-linear or sub-linear scaling at larger scale. This is due to the interplay between memory availability and the maximum global batch size.
+
+The core technology of DeepSpeed-HE is based on ZeRO, which partitions the model states across the GPUs during training. As the number of GPUs grows, per-GPU memory consumption drops, allowing DeepSpeed-HE to support a larger batch size per GPU and hence super-linear scaling. At larger scale, however, while the available memory keeps increasing, the capped maximum global batch size limits the per-GPU batch size, resulting in near-linear or sub-linear scaling. As a result, for a given maximum global batch size (e.g., we use 1024 sentences with a sequence length of 512), DeepSpeed-HE achieves its best throughput and cost efficiency at the boundary between super-linear and sub-linear scalability. The exact point is mostly determined by the largest batch size that can run per GPU, which in turn is a function of the available memory and the global batch size.
+
+# 6. Release: Try DeepSpeed Chat Today!
+
+We are very excited to announce that DeepSpeed-Chat is now open-sourced and available to the AI community.
+
+* If you find our work useful or like our open-source contributions, please star ⭐ our [DeepSpeed](https://github.com/microsoft/DeepSpeed) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) repositories.
+
+
+* To get started, please visit our DeepSpeed-Chat GitHub page: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)
+
+
+* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as those planned for the future.
+
+DeepSpeed-Chat is part of the larger DeepSpeed ecosystem, which comprises a multitude of deep learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* We publish our latest Chinese blog posts and updates on [Zhihu](https://www.zhihu.com/people/deepspeed). You can also follow our [English Twitter](https://twitter.com/MSFTDeepSpeed) and [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP) accounts.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, and others on deep learning research and on applying DeepSpeed to empower real-world AI models and applications. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
diff --git a/blogs/deepspeed-chat/ds-chat-release-8-31/README.md b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md
new file mode 100644
index 000000000000..e1d4bf952bf1
--- /dev/null
+++ b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md
@@ -0,0 +1,352 @@
+
+
+# DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements
+
+
+
+
+
+
+
+# Table of Contents
+1. [Introduction](#introduction)
+2. [System Support for Llama and Llama-2 models](#system-support-llama)
+3. [Improved Efficiency and Accessibility](#new-features)
+ - [3.3x Higher Throughput with MixZ++ for LoRA](#mixz)
+ - [ZeRO-Offload Support for Larger Models with 16x fewer GPUs](#zero-offload)
+4. [Stability Bug Fixes](#stability-bug-fixes)
+5. [Software Improvements](#software-improvements)
+ - [Characterization Scripts](#characterization-scripts)
+ - [Instrumentation](#instrumentation)
+ - [Testing](#testing)
+6. [Try Out DeepSpeed-Chat](#try-out-deepspeed-chat)
+
+
+# 1. Introduction
+
+DeepSpeed-Chat is a general system framework for RLHF training that enables easy, fast, affordable, and scalable training of ChatGPT-style models that we [publicly released on GitHub](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md). The detailed performance and capabilities of DeepSpeed-Chat have been published in our [blog post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) and [arXiv](https://arxiv.org/abs/2308.01320) paper.
+
+We are happy to share that today we are improving DeepSpeed-Chat in three areas: i) system support for the Llama/Llama-2 family of models, ii) system features for improved efficiency and accessibility, and iii) stability and software enhancements.
+
+- **System support for training Llama and Llama-2 models**
+
+ We ***introduce system support for training Llama and Llama-2 models*** in DeepSpeed-Chat enabling and leveraging various optimizations and features including the Hybrid Engine, ZeRO family of optimizations, Low-Rank Adaptation (LoRA) support, as well as full integration into the three-stage DeepSpeed-Chat RLHF pipeline. By leveraging the Hybrid-Engine, we speed up the experience generation phase for Llama-2-7B and Llama-2-13B models by **up to 7.1X**.
+
+- **New System Features for Improved Efficiency and Accessibility**
+  - ***Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954))***. It is an extended set of optimization strategies built upon [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed, similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by **up to 3.3x** for the Llama-2-70B model running on 128 V100 GPUs.
+
+ - ***[ZeRO-Offload](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/)***. It is an optimization that offloads optimizer memory and computation from the GPU to the host CPU, enabling larger models to be trained with fewer GPU resources. After training stability fixes and testing, we have enabled this feature across all three stages of the DeepSpeed-Chat RLHF training pipeline. ZeRO-Offload reduces the minimum number of GPUs required to train large models by **up to 16x**.
+
+- **Stability and Software Enhancements**
+
+ - DeepSpeed-Chat contains a rich set of features for training across many different platforms and scenarios. Composing these features in a systematic way and ensuring both system stability and decent training convergence is critical for the usability of the framework. Thus, in addition to new features in DeepSpeed-Chat, many system stability and training convergence issues have been fixed both in DeepSpeed-Chat (client code) and DeepSpeed (runtime). These improvements have been thoroughly tested using the OPT model family for end-to-end training. Furthermore, end-to-end testing, characterization scripts, and several instrumentation features like TensorBoard support are now also available. *To try out these latest features and software improvements, please use DeepSpeed release [v0.10.2](https://github.com/microsoft/DeepSpeed/tree/v0.10.2) and the latest DeepSpeed-Chat in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)*.
+
+ - Finally, to ensure the long-term health of the DeepSpeed-Chat training framework, [PyTests](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) were added for testing Step 3 of the RLHF training pipeline and are run on a nightly basis through a newly developed [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml).
+
+We now dive into the details of our new features, training stability, and software improvements.
+
+# 2. System Support for Llama and Llama-2 models
+
+The DeepSpeed-Chat training framework now provides system support for the Llama and Llama-2 models across all three stages of training. To support this, we encountered a spectrum of issues, spanning from minor runtime errors to intricate performance-related challenges. In particular, the Llama model architecture, which deviates from the standard Transformers block, was incompatible with DeepSpeed's inference kernels and the DeepSpeed container policy used by the Hybrid Engine. Addressing these hurdles necessitated extensive modifications across our DeepSpeed-Chat pipeline and the DeepSpeed runtime, including code to support the ZeRO family of optimizations and their interaction with optimized inference kernels in the Hybrid Engine. We have resolved these challenges to ensure that DeepSpeed-Chat can support Llama and Llama-2 and provide our users with the best possible experience. The details can be seen in the several PRs that have been merged in our codebases.
+
+## Key Supported Optimizations
+
+The following key optimizations in DeepSpeed are now fully integrated for Llama and Llama-2 models:
+
+- **DeepSpeed-Chat Integration**: Fully integrated into the complete, end-to-end three-stage DeepSpeed-Chat RLHF training framework, based on the OpenAI InstructGPT training strategy.
+- **Hybrid Engine**: DeepSpeed Hybrid Engine allows for superior generation phase [acceleration](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), now supported for all Llama-1 model variants, Llama-2-7B, and Llama-2-13B models.
+- **ZeRO and ZeRO-Offload**: Fully supported by the [ZeRO](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) family of optimizations, including offload support that leverages the full memory capacity of a system, thus enabling training of even larger models.
+- **Mixed Precision ZeRO++ (MixZ++)**: Enhanced support for larger models like Llama-2-70B through the new MixZ++ feature, improving efficiency and reducing memory usage when there are frozen or non-trainable parameters.
+- **LoRA**: Fully supported by the [LoRA](https://github.com/microsoft/LoRA) feature, which vastly reduces the storage requirements for large language models by freezing original weights and learning pairs of rank-decomposition matrices.
+
+## Getting Started
+
+Users looking to try the new Llama and Llama-2 model support can get started by using the newly added Llama scripts.
+| Step Number | Scripts |
+| --- | --- |
+| 1 | [Llama-2 Step 1 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2) |
+| 2 | [Llama-2 Step 2 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2) |
+| 3 | [Llama-2 Step 3 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2) |
+
+*Note*: While all the system aspects of Llama and Llama-2 support have been extensively tested, there are no guarantees about training convergence, and hyper-parameter tuning may be required to achieve convergence.
+
+## Performance Evaluation
+
+We highlight the performance benefits of the Hybrid Engine for Llama-2 models on NVIDIA A100 and V100 GPUs in this section. Improved performance for larger models like Llama-2-70B and reduced resource requirements via ZeRO-Offload are discussed in the [next section](#new-features).
+
+#### A100 Performance Evaluation
+Using A100 GPUs, we achieve 7.1x faster generation for Llama-2-7B and 5.4x faster generation for Llama-2-13B with DeepSpeed-Chat Hybrid Engine compared to DeepSpeed-Chat without Hybrid Engine (baseline) as shown in *Figure 1*.
+
+
+
+
+
+ *Figure 1: Up to 7.1x faster Llama-2 generation with DS-Chat Hybrid Engine*
+
+
+
+#### V100 Performance Evaluation
+Using V100 GPUs, we achieve 4x faster generation for Llama-2-7B and 2.1x faster generation for Llama-2-13B with DeepSpeed-Chat Hybrid Engine compared to DeepSpeed-Chat without Hybrid Engine (baseline) as shown in *Figure 2*.
+
+
+
+
+
+  *Figure 2: [Left] 4x faster Llama-2-7B generation with DS-Chat Hybrid Engine (16 V100 GPUs) [Right] 2.1x faster Llama-2-13B generation with DS-Chat Hybrid Engine on 32 V100 GPUs vs. DS-Chat without Hybrid Engine on 16 V100 GPUs.*
+
+
+
+
+# 3. Improved Efficiency and Accessibility
+
+We now dive into the details of the two new features we are introducing today: 1) Mixed Precision ZeRO++ (MixZ++) and 2) ZeRO-Offload. Both of these features offer unique benefits for DeepSpeed-Chat users: MixZ++ provides up to 3.3x higher throughput for LoRA-enabled training, and ZeRO-Offload reduces the minimum number of GPUs required for training by up to 16x.
+
+## 3.3x Higher Throughput with MixZ++ for LoRA
+
+Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954)) is an extended set of optimization strategies built upon [ZeRO](https://www.deepspeed.ai/tutorials/zero/) and [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA.
+
+Similar to [ZeRO](https://www.deepspeed.ai/tutorials/zero/), MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them only when needed. In addition, similar to ZeRO++, MixZ++ allows for hierarchical partitioning and quantized communication. The hierarchical partitioning allows all the parameters to be stored within a node when possible so that the communication happens within a node, where communication bandwidth is significantly higher than communicating across nodes. The communication overhead is further reduced by quantizing the weights before gathering them.
+
+Finally, unlike ZeRO++, where parameters are always stored in fp16/bf16 and quantized/dequantized before and after communication, MixZ++ can persistently store the frozen weights of [Low-Rank Adaptation (LoRA)](https://github.com/microsoft/LoRA) training in lower precision, significantly reducing the communication overhead, eliminating the quantization overhead, and supporting larger batch sizes that enable better efficiency.
+
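+These strategies are switched on through ZeRO config flags; below is a minimal sketch following the public MixZ++ tutorial (flag availability depends on your DeepSpeed version, and the surrounding values are illustrative):
+
+```python
+# Illustrative ZeRO-3 configuration with ZeRO++/MixZ++ style options.
+ds_config = {
+    "zero_optimization": {
+        "stage": 3,
+        "zero_quantized_weights": True,               # quantized weight gather
+        "zero_hpz_partition_size": 8,                 # hierarchical partitioning within a node
+        "zero_quantized_nontrainable_weights": True,  # keep frozen (LoRA base) weights in low precision
+    },
+}
+```
+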
+A comprehensive exploration of technical details can be accessed through our [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/), [MixZ++ tutorial](https://www.deepspeed.ai/tutorials/mixed_precision_zeropp/), and [paper](https://arxiv.org/pdf/2306.10209.pdf).
+
+#### Highlights
+
+State-of-the-art approaches like [QLoRA](https://arxiv.org/abs/2305.14314) focus on combining multiple techniques, such as quantization of LoRA weights, new datatypes like NF4, and memory-management/offload techniques like paged optimizers, to enable finetuning of large models on a single GPU. MixZ++ is our quantization-powered approach to large-model training, designed to scale to a large number of GPUs with simplicity and compatibility with existing technologies like ZeRO-Offload and the DeepSpeed Hybrid Engine.
+
+MixZ++ has the following highlights:
+- Simplicity: A general solution requiring no assumptions about the model and/or optimizer. Integrating it into your training script is as simple as adding a single line of code.
+- Performance: Powered by a set of highly optimized CUDA kernels that enables efficient quantization/dequantization. The evaluation shows up to 3.3x higher throughput for Llama-2-70B training on 128 GPUs compared to the ZeRO-3 baseline (*Figure 3*).
+- Compatibility: Compatible with DeepSpeed/ZeRO features like DeepSpeed Hybrid Engine, ZeRO-Offload, etc.
+- Scalability: Designed to scale to a large number of GPUs. It is tested on up to 384 GPUs on Azure.
+
+
+#### Performance Evaluation
+To assess the effectiveness of MixZ++ for LoRA-enabled training, we carried out a series of RLHF training experiments (Step 3) using the Llama-2-70B model. These experiments were conducted on hardware configurations featuring 64 and 128 V100 GPUs. A visual representation of the experiment results is shown in the following figure:
+
+
+
+
+  *Figure 3: We achieve 3.3x increased throughput for RLHF training of Llama-2-70B on 128 V100 GPUs using Mixed Precision ZeRO++ vs. ZeRO-3. We observed 2x improved throughput for the same experiment on 64 V100 GPUs.*
+
+
+
+Specifically, our results showcase a 2x increase in training throughput when utilizing 64 GPUs with MixZ++, compared to the ZeRO-3 baseline. Furthermore, when scaling up to 128 GPUs, the speedup effect becomes even more pronounced, with a substantial 3.3x improvement in training throughput. These outcomes underscore the potential of MixZ++ as a powerful tool for improving training efficiency in large-scale GPU settings.
+
+To try this feature, please refer to [MixZ++ tutorial](https://www.deepspeed.ai/tutorials/mixed_precision_zeropp/).
+
+## ZeRO-Offload Support for Larger Models with 16x fewer GPUs
+
+[ZeRO-Offload](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) powers unprecedented model sizes by leveraging the full memory capacity of a system, concurrently exploiting all heterogeneous memory. Modern GPU clusters have 2-3x more CPU memory than GPU memory. ZeRO-Offload capitalizes on this disparity and offloads optimizer memory and computation from the GPU to the host CPU, enabling larger models to be trained with fewer GPU resources without being bottlenecked by the CPU's lower bandwidth. ZeRO-Offload allows training of large models on up to 16x fewer GPUs as we can see in *Figure 4*.
+
+
+
+
+ *Figure 4: ZeRO-Offload enables us to train Llama-2-7B with 16x fewer GPUs. 16 V100 GPUs are required for training Llama-2-7B with DS-Chat ZeRO-3. Enabling LoRA allows for the number of GPUs to be reduced to 4 while enabling ZeRO-Offload reduces the number of needed GPUs to 1. The HuggingFace Baseline does not run due to memory limitations.*
+
+
+
+ZeRO-Offload was [disabled](https://github.com/microsoft/DeepSpeedExamples/pull/553) with the initial release of DeepSpeed-Chat due to training instability observed when it was used with Hybrid Engine and LoRA. After improvements to Hybrid Engine and LoRA, as well as extensive testing of all feature configurations for ZeRO Stage 2 and ZeRO Stage 3, this feature can now be enabled across all three steps of the DeepSpeed-Chat training framework. Please note that configuring ZeRO-Offload with ZeRO Stage 2 and Hybrid Engine while LoRA is disabled is currently unsupported due to observed training instability.
+
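+For reference, ZeRO-Offload is enabled through the `offload_optimizer` block of a DeepSpeed config; below is a minimal sketch (the surrounding values are illustrative, not DeepSpeed-Chat's exact settings):
+
+```python
+# Minimal DeepSpeed config sketch with optimizer states offloaded to CPU.
+ds_config = {
+    "train_batch_size": 32,
+    "fp16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu",     # optimizer states and updates live on the host
+            "pin_memory": True,  # pinned host memory for faster transfers
+        },
+    },
+}
+```
+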
+
+
+
+ *Figure 5: Reward scores for all supported DeepSpeed-Chat configurations with ZeRO-Offload enabled. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
+
+
+
+# 4. Stability Bug Fixes
+
+A wide range of issues have been addressed in the DeepSpeed runtime and the DeepSpeed-Chat pipeline. These fixes enable advanced features such as Hybrid Engine, LoRA, and ZeRO-Offload to run across all training steps of the DeepSpeed-Chat pipeline and improve training stability and convergence.
+
+
+
+
+ *Figure 6: Step 3 Reward Scores for all supported DeepSpeed-Chat configurations. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
+
+
+
+*Figure 6* above shows the training convergence across all supported DeepSpeed-Chat configurations. This data was collected using 16 V100 NVIDIA GPUs, the [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) OPT model as the actor, the [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) OPT model as the critic, and the following DeepSpeed and DeepSpeedExamples repository commits: DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).
+
+We now dive into the details of all the fixes across different areas.
+
+## DeepSpeed-Chat Pipeline Fixes
+
+In this section we discuss the functionality and training stability fixes in the DeepSpeed-Chat pipeline.
+
+- **Training Stability:**
+
+ - [PR #620 - Make training more stable](https://github.com/microsoft/DeepSpeedExamples/pull/620)
+
+    - To improve the training stability in Step 3, several different areas of training were tuned and changed. To start, the Kullback-Leibler (KL) divergence used in the Proximal Policy Optimization (PPO) trainer was slightly tuned to reduce the divergence between the new and reference policies and improve the reward score. Next, the sequence generation function in the PPO trainer (`_generate_sequence()`) no longer specifies a `min_length` in the Actor model's `generate()` call. This means generated sequences are not artificially lengthened, allowing sequence-generation collapse to surface, e.g., when training convergence is extremely poor. A minor off-by-one error was also fixed in the PPO trainer's reward computation function (`compute_rewards()`). Finally, the PPO trainer's RLHF training function was updated to zero out the reward and value after the end of a conversation, to prevent incorrect `advantages` and `returns`.
+
+ - [PR #633 - DS Chat Step 3 - Add separate Lora Adam optimizer group](https://github.com/microsoft/DeepSpeedExamples/pull/633)
+
+    - The [LoRA](https://github.com/microsoft/LoRA) feature is supported across all three training steps of the DeepSpeed-Chat framework. Prior to this stability effort, there was no distinction between the overall learning rate and the LoRA learning rate, i.e., the LoRA learning rate was set to whatever the overall learning rate was. This led to instability in training convergence, as can be seen in *Figure 7* below, which shows the reward score across training steps for various Step 3 configurations:
+
+
+
+
+ *Figure 7: Before the fix, the sweep across all ZeRO-2 cases without a separate LoRA learning rate shows training instability when LoRA is used.*
+
+
+
+    To address this training convergence issue, when creating the optimizer grouped parameters, the LoRA `lora_right_weight` and `lora_left_weight` parameters were explicitly separated out and given their own LoRA-specific learning rate (a sketch of this appears after this list). After this change, a dramatic improvement in stability was observed, as shown in the figure below:
+
+
+
+
+ *Figure 8: After creating a separate LoRA learning rate, the sweep across all ZeRO-2 cases shows proper convergence.*
+
+
+
+ The next fix details the addition of separate LoRA learning rate arguments.
+
+  - [PR #685 - Add LoRA LR for DS Chat steps 1-3](https://github.com/microsoft/DeepSpeedExamples/pull/685)
+
+ - A *separate* LoRA learning rate argument can now be provided in each of the three training steps, with Step 3 having individual LoRA learning rates for the Actor and Critic models.
+
+- **Bug Fixes:**
+
+ - [PR #636 - DS Chat Step 3 - Fix Zero Stage 3](https://github.com/microsoft/DeepSpeedExamples/pull/636)
+
+    - During DeepSpeed-Chat Step 3 training, we observed hangs when ZeRO Stage 3 was enabled for the actor model and the `world_size > 1`. When observing the state of each rank, one rank would still be in the sequence generation phase `self._generate_sequence()`, while the other rank had already progressed to the `self.actor_model()` call. This ZeRO Stage 3 desynchronization, due to misaligned token generation between the GPUs, can normally be automatically detected and accounted for in the HuggingFace Transformers library via `synced_gpus`. However, due to the nature of the DeepSpeed-Chat pipeline and the lifetime of the corresponding model configuration objects, this automatic detection code was not triggered. To resolve this, when invoking the `generate()` function, the `synced_gpus` argument is explicitly passed and set to `True` when ZeRO Stage 3 is being used (see the second sketch after this list).
+
+ - [PR #658 - Fix only optimize lora and ack-ckpting compatible](https://github.com/microsoft/DeepSpeedExamples/pull/658)
+
+ - This fix allows Step 3 training to run with the combination of gradient checkpointing and *LoRA-only* parameter optimization, a previously unsupported training case. With the addition of the [enable_input_require_grads](https://github.com/huggingface/transformers/blob/f26099e7b5cf579f99a42bab6ddd371bf2c8d548/src/transformers/modeling_utils.py#L1225) model utility function in the HuggingFace Transformers library, which enables the gradients for the input embeddings, gradient checkpointing and optimization of *only* the LoRA parameters is made possible.
+
+ - [PR #576 - Fix argparse](https://github.com/microsoft/DeepSpeedExamples/pull/576)
+
+ - An external contributor helped in resolving an argument parsing issue.
+
+ - [PR #584 - Fix unused parameter bug](https://github.com/microsoft/DeepSpeedExamples/pull/584)
+
+ - An external contributor fixed the passing of an uninitialized parameter that was hardcoded earlier.
+
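+Below is a minimal sketch of the separate-optimizer-group idea from the stability fixes above; it follows the `lora_right_weight`/`lora_left_weight` naming of DeepSpeed-Chat's LoRA module but is not the exact repository code:
+
+```python
+import torch
+
+def get_grouped_parameters(model, lr, lora_lr, weight_decay=0.0):
+    # Split trainable parameters so LoRA matrices get their own learning rate.
+    lora_params, base_params = [], []
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue
+        if "lora_right_weight" in name or "lora_left_weight" in name:
+            lora_params.append(param)
+        else:
+            base_params.append(param)
+    return [
+        {"params": base_params, "lr": lr, "weight_decay": weight_decay},
+        {"params": lora_params, "lr": lora_lr, "weight_decay": weight_decay},
+    ]
+
+# e.g., optimizer = torch.optim.AdamW(get_grouped_parameters(model, 9.65e-6, 5e-4))
+```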
+
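+And a sketch of the `synced_gpus` fix from PR #636; `zero_stage` stands in for the pipeline's configuration, and the generation arguments are illustrative:
+
+```python
+def generate_sequence(model, tokenizer, prompts, zero_stage, max_new_tokens=256):
+    # Under ZeRO Stage 3 every rank must take the same number of generate()
+    # steps, so synced_gpus keeps the ranks in lockstep and prevents hangs.
+    batch = tokenizer(prompts, return_tensors="pt", padding=True)
+    return model.generate(
+        **batch,
+        max_new_tokens=max_new_tokens,
+        synced_gpus=(zero_stage == 3),
+    )
+```
+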
+## Hybrid Engine Fixes
+In this section we discuss several fixes in the Hybrid Engine.
+
+- [PR #3563 - Fix LoRA Fuse/Unfuse in Hybrid Engine](https://github.com/microsoft/DeepSpeed/pull/3563)
+
+ - During Step 3 training for OPT with LoRA and Hybrid Engine enabled, an issue arose regarding a tensor size mismatch of the LoRA weights. Specifically, the LoRA QKV weights were not fused in the OPT container policy, yet they were expected to be fused by the Hybrid Engine. This challenge was effectively resolved by introducing both fused and unfused LoRA methods in the Hybrid Engine. We thank @sxjscience for providing this fix.
+
+- [PR #3883 - Extend HE-Lora test with Z3 support + Fix/add guard in HE for Z3](https://github.com/microsoft/DeepSpeed/pull/3883)
+
+ - The Hybrid Engine was updated to properly check whether ZeRO Stage 3 was enabled when resetting the inference container parameters, along with expanding the corresponding unit tests.
+
+
+## ZeRO Stage 3 Fixes
+In this section we discuss several fixes in support of the ZeRO Stage 3 feature.
+
+- [PR #3819 - Fix racing condition in GatheredParameters](https://github.com/microsoft/DeepSpeed/pull/3819)
+
+    - A race condition in the ZeRO `GatheredParameters` context, which resulted in various `'status': 'INFLIGHT'` issues, was fixed by removing duplicate input parameters that were being passed from the Hybrid Engine.
+
+- [PR #3884 - Separate ZeRO3 InflightParamRegistry for train and eval](https://github.com/microsoft/DeepSpeed/pull/3884)
+
+ - The ZeRO Stage 3 `InflightParamRegistry` was updated to use a separate `InflightParamRegistry` for training and evaluation, fixing an issue where leftover parameters in flight were causing inflight parameter errors. These fixes, along with related fixes in the Hybrid Engine, enabled the use of the ZeRO-Offload feature in the DeepSpeed-Chat training pipeline.
+
+- [PR #3928 - Remove the param.ds_tensor from print](https://github.com/microsoft/DeepSpeed/pull/3928)
+
+ - A minor change that was necessary to address the DeepSpeed-Chat Step 3 hang issue ([PR #636](https://github.com/microsoft/DeepSpeedExamples/pull/636)) as it allowed us to progress further into execution and observe the desynchronization point.
+
+
+# 5. Software Improvements
+
+To improve the characterization, ease of debug, and maintainability of the DeepSpeed-Chat framework, several areas of software improvements have been completed. Characterization scripts were added to enable systematic composition of features, instrumentation was added to improve insight into the behavior of training, and a testing CI workflow was added to improve the maintainability of the DeepSpeed-Chat training framework.
+
+## Characterization Scripts
+
+The DeepSpeed-Chat training framework provides a rich set of features (Hybrid Engine, ZeRO, LoRA, etc.) that can be composed in many different combinations, depending on the scenario. The interactions between the features are often complex and composing them in a systematic way for characterization is useful for understanding their behavior. To support such use cases, characterization scripts have been added to run sweeps of Steps 1, 2, and 3 training for various combinations of features. The scripts default to OPT but can be modified to run with Llama. Please see the READMEs in the following folders for more details:
+
+- [Step 1 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep)
+- [Step 2 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep)
+- [Step 3 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep)
+
+For example, the Step 3 characterization script sweeps across various training features:
+| Feature | Values |
+| --- | --- |
+| ZeRO Stage | 2, 3 |
+| Hybrid Engine | True, False |
+| ZeRO-Offload | True, False |
+| LoRA | True, False |
+
+The sweep can be run as follows:
+
+```
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning$ bash training_scripts/opt/single_node/sweep/run_step3_sweep.sh
+```
+
+The training log for each combination of features is stored in a folder whose name is formatted as `z${z}_he_${he}_offload_${offload}_lora_${lora}`.
+
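+For clarity, the 16-case sweep space and the resulting log-folder names can be enumerated as follows (a sketch, not the script itself):
+
+```python
+from itertools import product
+
+for z, he, offload, lora in product((2, 3), (True, False), (True, False), (True, False)):
+    print(f"z{z}_he_{he}_offload_{offload}_lora_{lora}")
+```
+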
+
+Related PRs:
+
+- [DS Chat Characterization Scripts (Step 1 and 3)](https://github.com/microsoft/DeepSpeedExamples/pull/638)
+- [Add step 2 sweep script, clean up scripts](https://github.com/microsoft/DeepSpeedExamples/pull/664)
+- [Update script location and docs for all 3 steps](https://github.com/microsoft/DeepSpeedExamples/pull/681)
+
+## Instrumentation
+
+To gain better insight into DeepSpeed-Chat training, new [instrumentation features](https://github.com/microsoft/DeepSpeedExamples/pull/624) were added across all three steps of DeepSpeed-Chat and can be enabled via arguments to each step's `main.py`.
+
+| Argument | Description | Step(s) |
+| --- | --- | --- |
+| --print_loss | Print loss during each step | 1 |
+| --enable_tensorboard | Enable TensorBoard logging at the model Runtime Engine level | 1,2,3 |
+| | Enable TensorBoard logging at the Training Pipeline level | 3 |
+| --tensorboard_path | Path to write TensorBoard log | 1,2,3 |
+| --print_answers | Print actor model prompt and answers during training across all ranks | 3 |
+
+
+### TensorBoard
+TensorBoard logging can be enabled in each of the three training steps, with some slight nuances in Step 3. To start, for each training step, the `enable_tensorboard` argument can be used to enable a TensorBoard monitor at the Runtime Engine level ([see documentation](https://www.deepspeed.ai/docs/config-json/#monitoring-module-tensorboard-wandb-csv)) and is reflected in the corresponding model training configuration:
+```python
+"tensorboard": {
+ "enabled": enable_tensorboard,
+ "output_path": f"{tb_path}/ds_tensorboard_logs/",
+ "job_name": f"{tb_name}_tensorboard"
+}
+```
+
+- **Step 3**:
+  Due to Step 3 initializing both an Actor and a Critic model, _each_ of the models has its own corresponding TensorBoard monitor at the Runtime Engine level. Beyond that, Step 3 training also contains a Pipeline-level TensorBoard monitor one level above the model runtime engines, which captures the `reward`, `actor_loss`, `actor_loss_sum`, `critic_loss`, and `critic_loss_sum`.
+
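+A minimal sketch of such a pipeline-level monitor, using PyTorch's standard `SummaryWriter` (not DeepSpeed-Chat's exact code; the metric values are placeholders):
+
+```python
+from torch.utils.tensorboard import SummaryWriter
+
+writer = SummaryWriter(log_dir="step3_tensorboard_logs")
+metrics = [(0.8, 1.2, 0.5), (1.1, 1.0, 0.4)]  # (reward, actor_loss, critic_loss) per step
+for step, (reward, actor_loss, critic_loss) in enumerate(metrics):
+    writer.add_scalar("reward", reward, global_step=step)
+    writer.add_scalar("actor_loss", actor_loss, global_step=step)
+    writer.add_scalar("critic_loss", critic_loss, global_step=step)
+writer.close()
+```
+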
+## Testing
+
+As part of the DeepSpeed team's commitment to maintaining the DeepSpeed-Chat training framework, continuous integration [PyTest](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) testing has been added for Step 3 RLHF training in a new [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml).
+
+| Description | Status |
+| ----------- | ------ |
+| Integrations | [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) |
+
+The workflow is run on a **nightly** basis across a **16-case** test matrix (see table below), and uses the **facebook/opt-125m** model for both the actor and critic.
+
+| Parameter | Values |
+| --- | --- |
+| ZeRO Stage | 2, 3 |
+| Hybrid Engine | True, False |
+| ZeRO-Offload | True, False |
+| LoRA | True, False |
+
+Each configuration (16 total) runs through a limited number of Step 3 non-overflow training steps (i.e. steps where neither actor nor critic overflow) and saves the actor/critic models. Assertions are used to check if the training pipeline executed correctly and if the actor and critic models were saved properly.
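+
+The checks take roughly the following shape; this is a hypothetical sketch, not the contents of `test_training.py`, and the launcher script and output paths are stand-ins:
+
+```python
+import os
+import subprocess
+
+def test_step3_saves_models(tmp_path):
+    # Hypothetical launcher: run a few non-overflow Step 3 steps, then save.
+    result = subprocess.run(["bash", "run_step3.sh", str(tmp_path)],
+                            capture_output=True, text=True)
+    assert result.returncode == 0, result.stderr
+    assert os.path.exists(tmp_path / "actor" / "pytorch_model.bin")
+    assert os.path.exists(tmp_path / "critic" / "pytorch_model.bin")
+```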
+
+# 6. Try Out DeepSpeed-Chat
+We are very excited to share this DeepSpeed-Chat feature and stability release.
+
+* To get started, please visit our GitHub page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)
+
+* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for future.
+
+DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem, comprising a multitude of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
+
+* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work!
diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md
new file mode 100644
index 000000000000..e7aa62721417
--- /dev/null
+++ b/blogs/deepspeed-chat/japanese/README.md
@@ -0,0 +1,345 @@
+
+
+# DeepSpeed Chat: Easy, Fast, and Affordable Training of ChatGPT-like Models at All Scales
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed Chat, please cite our [arxiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+ title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+ author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+ journal={arXiv preprint arXiv:2308.01320},
+ year={2023}
+}
+```
+
+# 1. Overview
+
+ChatGPT and similar models have taken the AI world by storm and are having a revolutionary impact on the digital industry. These models are extremely versatile and can perform diverse tasks such as summarization, coding, and translation with results on par with, or better than, those of human experts. Spurred by their overwhelming capabilities, multiple efforts are underway in the AI open-source community to make ChatGPT-style models more accessible (e.g., ChatLLaMa, Alpaca, Vicuna, Databricks-Dolly).
+
+Despite the great effort invested across these projects, however, there has been no end-to-end pipeline that can run the RLHF (Reinforcement Learning from Human Feedback) training required for ChatGPT-like models easily enough and with high efficiency. For example, training a 6.7B ChatGPT-like model requires many expensive GPUs, putting it out of reach for many data scientists. Even with access to such compute resources, conventional software could extract less than 5% of the hardware's performance (see the [overview](#effective-throughput-and-scalability)). And there has been no way to train state-of-the-art ChatGPT-like models with hundreds of billions of parameters easily, quickly, and affordably with existing software.
+
+RLHF, as proposed in InstructGPT and used for training ChatGPT, is completely different from conventional pretraining and fine-tuning and requires a far more complex pipeline, which conventional software had no mechanism to support effectively. To make RLHF training available to the broad AI community and enable everyone to create ChatGPT-like models, we are releasing DeepSpeed-Chat with the following capabilities:
+
+(i) ***Easy-to-run training and inference for ChatGPT-like models***: We provide scripts that start from a pretrained model available in the Hugging Face repository, run all three steps of InstructGPT-style training, and produce your own ChatGPT-like model. We also provide an inference API for testing conversational interaction after training.
+
+(ii) ***DeepSpeed-RLHF pipeline***: The DeepSpeed-RLHF pipeline comprehensively reproduces, with one-to-one correspondence, the three steps of the InstructGPT training pipeline: a) supervised fine-tuning (SFT), b) reward model fine-tuning, and c) RLHF (Reinforcement Learning with Human Feedback). It also provides data abstraction and blending capabilities to enable training with multiple data sources at once.
+
+(iii) ***DeepSpeed-RLHF system***: We provide DeepSpeed-HE, a hybrid engine for RLHF that unifies DeepSpeed's training and inference capabilities. DeepSpeed-HE can switch seamlessly between inference and training modes within the RLHF pipeline, leveraging various DeepSpeed-Inference optimizations for inference, such as tensor parallelism and high-performance Transformer kernels, while using the many memory-optimization techniques of ZeRO and LoRA for RL training. DeepSpeed-HE is also designed to fit the RLHF pipeline exactly, applying the optimal techniques for memory management and data movement in each phase of RLHF.
+
+The DeepSpeed-RLHF system achieves unmatched efficiency for large-scale model training, letting the AI community run complex RLHF training quickly, affordably, and easily:
+
+***Efficiency and cost***: In terms of efficiency, [DeepSpeed-HE is more than 15x faster than existing systems](#effective-throughput-and-scalability), making RLHF training fast and affordable. For example, DeepSpeed-HE can train an OPT-13B model in just 9 hours and OPT-30B in 18 hours on Azure Cloud, for under $300 and $600 respectively.
+
+
+
+| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B |
+| ------- | :----------------------------------------------------------: | :------------------------------: | :-----: | :-----: |
+| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA |
+| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) |
+
+*Table 1. Training time on a single node (8x A100) and approximate cost on Azure.*
+
+
+
+
+***Excellent scalability***: DeepSpeed-HE supports models with hundreds of billions of parameters and achieves excellent scalability on systems with multiple nodes and multiple GPUs. As a result, a 13B model can be trained in 1.25 hours, and even a huge 175B model can be trained within a day.
+
+
+
+| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-like-175B |
+| ------------ | :-------------------------------: | :---------------------------------: | :-----: | :-----------: |
+| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120) |
+
+*Table 2. Training time on multiple nodes (64x A100-80GB) and approximate cost on Azure.*
+
+
+> ***Note***: The numbers in the two tables above are for Stage 3 of training. They are based on measured throughput for one epoch of training over a total of 135M tokens, with the dataset and training configuration used by DeepSpeed-RLHF: 67.5M query tokens (130K queries with sequence length 256) and 67.5M generated tokens (130K answers with sequence length 256), with a maximum global batch size of 0.5M tokens (1024 queries and answers each) per step. Please review these details carefully before comparing cost and runtime against DeepSpeed-RLHF. See the [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) for more details.
+
+***Democratizing RLHF***: DeepSpeed-HE can train a model with more than 13 billion parameters on a single GPU. Data scientists without access to expensive multi-GPU setups can create not just small toy models but large, powerful RLHF models usable in real-world scenarios.
+
+
+
+| | V100 32G | A6000 48G | A100 40G | A100 80G |
+| ---------- | :------: | :-------: | :------: | :------: |
+| Model Size | OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B |
+
+*Table 3. Model sizes trainable on a single GPU with DeepSpeed-HE.*
+
+
+
+The sections below take a deeper look at the three capabilities of DeepSpeed-Chat introduced above.
+
+# 2. Easy-to-use ChatGPT Training and Inference Experience
+
+We start by showing how easy training with DeepSpeed-RLHF is, with examples that train OPT-13B and OPT-66B models. Even if you have only a short time for training, you can train an OPT-1.3B model in just two hours on a single consumer-grade GPU. We also show how to use the DeepSpeed-Chat APIs to build a customized pipeline.
+
+## Training ChatGPT-style models with a DeepSpeed-Chat example
+
+*a) Run a single script to execute all three stages of RLHF training and watch it produce your own ChatGPT-like model!*
+
+Here we show an example that uses RLHF to train a pretrained OPT-13B as the actor model and OPT-350M as the reward model.
+Running the following script produces a final ChatGPT-like model with 13 billion parameters:
+
+```python
+pip install deepspeed>=0.9.0
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-Chat/
+pip install -r requirements.txt
+
+python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
+```
+
+On a DGX server with 8 A100 GPUs, the 13-billion-parameter model can be trained in half a day. The following is a breakdown of the time each step takes.
+
+
+
+| Model Size | Step 1 | Step 2 | Step 3 | Total |
+| -------------------------------- | :----: | :----: | :----: | :----: |
+| Actor: OPT-13B, Reward: OPT-350M | 2.5 hours | 0.25 hours | 10.8 hours | 13.6 hours |
+
+*Table 4. End-to-end time breakdown on a single DGX server with 8 A100 GPUs.*
+
+
+
+*b) Try your trained model with DeepSpeed Chat's inference API!*
+
+Plug in the trained model and give it a try. You can enjoy not only question answering but also conversational interaction.
+
+```
+Human: Do you know Microsoft?
+Assistant: Yes, I’m a big fan of Microsoft. What about them?
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
+Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which
+ is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the
+ programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
+ use the computer.
+```
+
+## Changing the model size and configuration
+
+You may need to try different model sizes and configurations to meet your requirements for training time, compute resources, and quality. With DeepSpeed-Chat, that is easy: for example, if you want to train a larger, higher-quality model on a GPU cluster for research or business, simply specify the desired model size and number of GPUs. The following example sets the actor model to 66B parameters and the number of GPUs to 64.
+
+```python
+python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
+```
+
+With 64 A100 (80GB memory) GPUs, a 66-billion-parameter ChatGPT model can be trained in 9 hours.
+
+
+
+| Model Size | Step 1 | Step 2 | Step 3 | Total |
+| -------------------------------- | :-----: | :----: | :----: | :---: |
+| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5 hours | 9 hours |
+
+*Table 5. End-to-end time breakdown for training a 66-billion-parameter model on 8 DGX servers, each with 8 A100 (80GB memory) GPUs.*
+
+
+
+You could also train a small toy model with DeepSpeed-Chat during a 1-2 hour coffee or lunch break. For example, we prepared an example that trains a 1.3B model on a single dataset so you can try the framework on a consumer-grade GPU. When you come back from your lunch break, the checkpoint of your trained model will be ready to play with.
+
+```python
+python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
+```
+
+
+
+| Model Size | Step 1 | Step 2 | Step 3 | Total |
+| --------------------------------- | :-------: | :------: | :----: | :---: |
+| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2 hours | 2.2 hours |
+
+*Table 6. End-to-end time breakdown on a single commodity-grade GPU (A6000).*
+
+
+
+## Customizing the RLHF pipeline with DeepSpeed-Chat's APIs
+
+DeepSpeed-Chat provides flexible APIs for configuring training, so users can build their own RLHF training pipelines. Through these APIs, users have access to a generic interface and backend for creating a wide range of RLHF algorithms for exploratory research.
+
+```python
+
+engine = DeepSpeedRLHFEngine(
+ actor_model_name_or_path=args.actor_model_name_or_path,
+ critic_model_name_or_path=args.critic_model_name_or_path,
+ tokenizer=tokenizer,
+ num_total_iters=num_total_iters,
+ args=args)
+
+trainer = DeepSpeedPPOTrainer(engine=engine, args=args)
+
+for prompt_batch in prompt_train_dataloader:
+ out = trainer.generate_experience(prompt_batch)
+ actor_loss, critic_loss = trainer.train_rlhf(out)
+```
+
+# 3. The Complete RLHF Training Pipeline
+
+For a seamless training experience, DeepSpeed-Chat incorporates a complete end-to-end training pipeline similar to that of InstructGPT (Figure 1).
+
+
+
+
+
+*Figure 1: Overview of DeepSpeed-Chat's training pipeline.*
+
+
+
+The pipeline is divided into the following three steps:
+
+* Step 1: Supervised fine-tuning (SFT): fine-tune a pretrained language model on carefully selected human responses to a variety of queries.
+* Step 2: Reward model fine-tuning: train a separate model (called the reward model, usually smaller than the SFT model) on a human-provided dataset that ranks multiple answers to the same query.
+* Step 3: RLHF training: further fine-tune the SFT model with feedback from the reward model, using the Proximal Policy Optimization (PPO) algorithm.
+
+In Step 3, two additional features can be used to further improve model quality:
+
+* Exponential Moving Average (EMA) collection: an EMA-based model checkpoint can be used for the final evaluation.
+* Mixture training: the pretraining objective (next-word prediction) is mixed with the PPO objective to prevent degradation of model quality on public benchmarks such as SQuAD2.0.
+
+These two features are often omitted in recent open-source projects. However, according to InstructGPT, EMA checkpoints generally achieve better response quality than the conventional final trained model, and mixture training preserves the benchmark-solving abilities acquired during pretraining. DeepSpeed-Chat provides both features so that training equivalent to what is described in InstructGPT can be carried out.
+
+In addition to the features that mirror InstructGPT, we also provide the following convenient capability to help researchers and developers train their own RLHF models using multiple data resources:
+
+* Data abstraction and blending capabilities: Models can be trained on multiple datasets to improve model quality. DeepSpeed-Chat is therefore equipped with (1) an abstract dataset layer that unifies the format of different datasets, and (2) data splitting/blending capabilities that properly blend multiple datasets and split them across the three training stages.
+
+
+# 4. DeepSpeed Hybrid Engine: the Infrastructure Powering RLHF Training
+
+Steps 1 and 2 of the RLHF pipeline, which learn from given instructions, resemble regular fine-tuning of large models. DeepSpeed-Chat therefore achieves high scalability and fast training for them through optimizations based on DeepSpeed's ZeRO technologies and flexible combinations of DeepSpeed's parallelization strategies. Step 3, by contrast, is the most complex part in terms of its impact on performance. Each training iteration consists of two phases: a) an inference phase for token/experience generation, producing the inputs for training, and b) a training phase that updates the parameters of the actor and reward models; the interaction and scheduling between the two must also be handled efficiently. Realizing this poses two challenges: (1) optimizing memory usage so that multiple copies of the SFT and reward models remain available throughout Stage 3, and (2) accelerating the generation phase, which heavily influences the speed of the entire Stage 3. When Exponential Moving Average (EMA) collection and mixture training are used, the required memory and processing time grow further.
+
+To tackle these challenges, we composed the full system capabilities of DeepSpeed training and inference into a unified infrastructure, the Hybrid Engine DeepSpeed-HE. It leverages the original DeepSpeed engine in training mode and applies the DeepSpeed inference engine in generation/inference mode, dramatically accelerating Stage 3 of RLHF training. As Figure 2 shows, switching between the DeepSpeed training and inference engines is seamless: with inference and training modes enabled for the actor model, DeepSpeed selects different optimizations for each when running the inference and training pipelines, improving the throughput of the overall system.
+
+
+
+
+
+
+*Figure 2. The Hybrid Engine (DeepSpeed-HE), which accelerates the most time-consuming part of RLHF.*
+
+
+
+For inference during the experience-generation phase of RLHF training, DeepSpeed-HE substantially improves throughput (tokens/second) over existing software, through a lightweight memory-management system that handles the KV cache and intermediate results, highly optimized inference kernels, and tensor parallelism.
+
+For training, memory-optimization techniques such as DeepSpeed's family of ZeRO technologies and Low-Rank Adaptation (LoRA) are available. DeepSpeed-HE is implemented so that these optimization techniques can be combined with one another to achieve very high training efficiency.
+
+DeepSpeed-HE seamlessly changes the model partitioning between training and inference, supporting inference based on tensor parallelism and training based on the sharding mechanism of DeepSpeed's ZeRO technologies. To make maximum use of memory, it also reconfigures memory allocation for each of these modes. This avoids memory-allocation bottlenecks and improves performance by supporting large batch sizes. The Hybrid Engine, which aggregates DeepSpeed's diverse system technologies for training and inference, goes beyond the limits of today's RLHF training and makes RLHF executable at unmatched scale and efficiency.
+
+
+# 5. DeepSpeed RLHF: Unmatched Scale and Efficiency via the Hybrid Engine DeepSpeed-HE
+
+## Capability Recap
+
+As described above, DeepSpeed-HE fuses powerful technologies for inference and training. It is designed to deliver excellent scalability and high efficiency for the DeepSpeed-RLHF pipeline across a wide range of hardware, making RLHF training fast, affordable, and easily accessible to the AI community.
+
+Table 1 shows the efficiency and cost for different model sizes and GPUs. With DeepSpeed-HE, OPT-13B can be trained in just 9 hours and OPT-30B in 18 hours on Azure Cloud, at a cost of less than $300 and $600 respectively. In terms of speed and scalability, as shown in Table 2, even a 13B-parameter model can be trained in 1.25 hours, and a huge 175B model can be trained within a day on a 64-GPU cluster. And from the perspective of making RLHF accessible to everyone, DeepSpeed-HE can train a model with more than 13 billion parameters on a single GPU, as shown in Table 3.
+
+
+## Throughput and model size scalability: comparison with existing RLHF systems
+
+Compared with other systems capable of RLHF training, such as Colossal-AI and Hugging Face with native PyTorch, DeepSpeed-RLHF is superior in both speed and scalability:
+
+* In terms of throughput, DeepSpeed achieves more than a 10x improvement for RLHF training on a single GPU (Figure 3). In multi-GPU settings, it achieves a 6-19x speedup over Colossal-AI and 1.4-10.5x over Hugging Face DDP (Figure 4).
+* In terms of model scalability, Colossal-AI can train a model of at most 1.3B on a single GPU and 6.7B on a single node with A100-40G GPUs, whereas DeepSpeed-HE can train 6.5B and 50B models on the same hardware, handling models up to 7.5x larger.
+
+DeepSpeed-HE therefore achieves more than an order of magnitude higher throughput than existing RLHF systems such as Colossal-AI and Hugging Face DDP, and can train much larger actor models in the same wall-clock time, or train similarly sized models at more than 10x lower cost.
+
+
+
+
+
+*Figure 3. Step 3 throughput comparison with other frameworks, using a single A100-40G GPU. Crosses indicate runs that fail with out-of-memory.*
+
+
+
+
+
+
+
+*Figure 4. End-to-end training throughput comparison for Step 3 (which accounts for most of the three-step total time), using a single DGX node with 8 A100-40G GPUs. Crosses indicate runs that fail with out-of-memory.*
+
+
+
+This efficiency stems from DeepSpeed-HE accelerating the generation phase of RLHF processing with DeepSpeed's highly optimized inference capabilities. Figure 5 shows the time breakdown of RLHF training for a 1.3B-parameter model: most of the time is spent in the generation phase. By leveraging DeepSpeed's high-performance inference kernels, DeepSpeed-HE achieves up to 9x the throughput of Hugging Face and 15x that of Colossal-AI in this phase, realizing unmatched end-to-end efficiency.
+
+
+
+
+
+*Figure 5. Generation-phase acceleration with DeepSpeed-HE (OPT-1.3B-based actor model + OPT-350M-based reward model, on a single DGX node with 8 A100-40G GPUs).*
+
+
+
+## Effective Throughput and Scalability
+
+***(I) Effective throughput analysis.*** The effective throughput of DeepSpeed-HE in Stage 3 of RLHF is determined by the throughput of both the generation phase and the RL training phase. In our RLHF pipeline, the generation phase accounts for about 20% of the total computation and the RL training phase for the remaining 80% (see the [benchmark page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) for details). Despite its small share of the computation, however, the former can occupy most of the end-to-end time: for each of the 256 generated tokens, it must run actor-model inference once over the initial prompt of 256 tokens, so it is limited by memory bandwidth and has difficulty achieving high throughput. In contrast, the RL training phase can be executed with only a few forward and backward passes over the reference actor model, using the full 512 tokens from both the prompt and the generation per sample, and can therefore achieve high throughput.
+
+
+
+
+
+*Figure 6. Generation, training, and effective throughput of RLHF with DeepSpeed-HE (GPU counts chosen for best efficiency).*
+
+
+
+To maximize the effective throughput, DeepSpeed-HE optimizes both the generation phase and the RL training phase. First, it uses the largest batch size possible to obtain higher efficiency in both phases. Second, in the generation phase, it leverages high-performance Transformer kernels to maximize GPU memory-bandwidth utilization when the model fits in the memory of a single GPU, and additionally uses tensor parallelism when it does not. Using tensor parallelism in the generation phase, instead of ZeRO-based memory savings, reduces inter-GPU communication and keeps GPU memory-bandwidth utilization high.
+
+Figure 6 shows the best effective throughput achievable with DeepSpeed-HE, in TFlops per GPU, for model sizes from 1.3B to 175B, together with the throughput achieved in the generation and training phases. DeepSpeed-HE is most efficient for models in the 6.7B-66B range. Beyond this range, at 175B, memory limitations prevent setting a large batch size and the throughput drops, although it is still 1.2x more efficient than for the 1.3B model. The per-GPU throughput when training such huge models may improve further on GPUs with more memory that can handle larger batch sizes.
+
+Moreover, as shown in Figure 4, our effective performance is 19x that of existing systems, which suggests they are operating at less than 5% of peak performance. This illustrates the difficulty of optimizing RLHF workloads, as well as the effectiveness of our system for the RLHF pipeline.
+
+
+
+
+
+*Figure 7. Scalability when increasing the number of DGX nodes (8 A100-40/80G GPUs per node), using 13B and 66B actor models with a 350M reward model.*
+
+
+
+***(II) Scalability analysis.*** The number of GPUs that yields the best throughput differs by model size. This is partly because larger models need more memory to run, and also because of the scalability properties of DeepSpeed-HE explained below.
+
+Figure 7 shows that DeepSpeed-RLHF achieves good overall scalability at up to 64 GPUs. A closer look, however, reveals that DeepSpeed-RLHF training achieves super-linear scaling at small scale and linear or sub-linear scaling at larger scale. This is due to the interplay between memory availability and the maximum global batch size.
+
+Because DeepSpeed-HE employs ZeRO technologies for training, it can partition the model across the available GPUs. Per-GPU memory consumption therefore decreases as the number of GPUs increases, which lets DeepSpeed-HE support larger per-GPU batch sizes and produces super-linear scaling. At larger scale, however, while the available memory continues to increase, the bounded maximum global batch size forces the per-GPU batch size to shrink, resulting in linear or sub-linear scaling. As a result, for a given maximum global batch size, DeepSpeed-HE achieves its best throughput and cost efficiency at the boundary between super-linear and sub-linear scalability, and the exact point is determined mostly by the largest batch size that can run on each GPU, as a function of the available memory and the global batch size.
+
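+This interplay can be seen with placeholder numbers (illustrative only, not measurements):
+
+```python
+# With ZeRO sharding, per-GPU model-state memory falls as GPUs are added,
+# freeing room for larger per-GPU batches, until the fixed global batch
+# (e.g., 1024 sequences) caps the per-GPU batch size instead.
+model_states_gb = 192.0  # placeholder size of all model states
+global_batch = 1024
+for gpus in (8, 16, 32, 64):
+    per_gpu_states = model_states_gb / gpus   # ZeRO partitioning
+    per_gpu_batch_cap = global_batch // gpus  # global-batch ceiling
+    print(f"{gpus} GPUs: {per_gpu_states:.1f} GB states/GPU, batch cap {per_gpu_batch_cap}")
+```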
+
+# 6. DeepSpeed-Chat Release: Try It Out Now!
+
+We are delighted to release DeepSpeed-Chat to the AI community as open-source software.
+
+* Visit the DeepSpeed-Chat [GitHub page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) and get started right away.
+* We plan to keep improving DeepSpeed-Chat with your feedback and help. See the [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) for the features that are currently supported and those planned for the future.
+
+
+# 7. About DeepSpeed
+
+DeepSpeed is software that provides a range of capabilities for making extremely large-scale, fast deep learning easy to realize.
+DeepSpeed-Chat is part of the broader DeepSpeed software ecosystem.
+DeepSpeed enables:
+
+* Training and inference of dense or sparse models with billions to trillions of parameters
+* High throughput and scalability to thousands of GPUs
+* Training and inference in environments with limited GPU resources
+* Unprecedented levels of low-latency, high-throughput inference
+* Low-latency inference and model-size reduction through advanced model compression
+
+DeepSpeed is part of Microsoft's [AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/), which is driving next-generation AI capabilities at scale; see [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) for details. DeepSpeed has been used to train a variety of large models, including [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf), [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed), [GLM (130B)](https://github.com/THUDM/GLM-130B), and [YaLM (100B)](https://github.com/yandex/YaLM-100B).
+
+DeepSpeed also serves as the backend of many prominent open-source deep learning frameworks, including [Hugging Face Transformers](https://huggingface.co/docs/transformers/main/main_classes/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), and [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html).
+
+For more information about DeepSpeed:
+
+* The [DeepSpeed website](https://www.deepspeed.ai/) hosts detailed blog posts, tutorials, and documentation about DeepSpeed's technologies.
+* The [DeepSpeed Twitter account (English)](https://twitter.com/MSFTDeepSpeed) shares the latest DeepSpeed news; please follow us. We also have a [Japanese Twitter account](https://twitter.com/MSFTDeepSpeedJP) that delivers the latest information in Japanese.
+
+The DeepSpeed team welcomes feedback and inquiries from users:
+
+* Bug reports, pull requests, and participation in discussions are welcome on [GitHub](https://github.com/microsoft/DeepSpeed/). See the [guidelines](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for details.
+* The DeepSpeed team collaborates with universities, research institutes, and companies on deep learning research with DeepSpeed and on applying it to real-world AI models and applications (researchers who can communicate in Japanese are also on the team). For such collaboration requests (and other topics not suitable for GitHub), please email deepspeed-info@microsoft.com directly.
diff --git a/blogs/deepspeed-fastgen/README.md b/blogs/deepspeed-fastgen/README.md
new file mode 100644
index 000000000000..c309a9def53f
--- /dev/null
+++ b/blogs/deepspeed-fastgen/README.md
@@ -0,0 +1,302 @@
+
+
+# DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference
+
+
+
+
+
+
+
+
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Key LLM Serving Techniques](#background)
+3. [Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy](#technical-approach)
+4. [Performance Evaluation](#performance-evaluation)
+5. [DeepSpeed-FastGen: Implementation and Usage](#using-deepspeed-fastgen)
+6. [Try out DeepSpeed-FastGen](#try)
+7. [Acknowledgements](#acknowledgements)
+
+
+## 1. Introduction
+
+Large language models (LLMs) like GPT-4 and LLaMA have emerged as a dominant workload in serving a wide range of applications infused with AI at every level. From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. While frameworks like DeepSpeed, PyTorch, and several others can regularly achieve good hardware utilization during LLM training, the interactive nature of these applications and the poor arithmetic intensity of tasks like open-ended text generation have become the bottleneck for inference throughput in existing systems.
+
+To this end, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf) powered by PagedAttention and research systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved the performance of inference for LLMs. However, these systems still struggle to provide consistent quality of service, particularly for workloads with longer prompts. These long prompt workloads are becoming increasingly important as more and more models, like [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b), and systems, such as [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows stretching to tens of thousands of tokens. To better understand the problem space, we describe in detail how text generation for LLMs works in two distinct phases called prompt processing and generation. When systems treat these as strictly distinct phases, generation is preempted by prompt processing, which risks breaking the service level agreements (SLAs).
+
+Today, we are glad to present DeepSpeed-FastGen, a system that overcomes these limitations by leveraging the proposed Dynamic SplitFuse technique and offers up to 2.3x higher effective throughput compared to state-of-the-art systems like vLLM. DeepSpeed-FastGen leverages the combination of DeepSpeed-MII and DeepSpeed-Inference to provide an easy-to-use serving system.
+
+**Quick Start:** Trying DeepSpeed-FastGen is as simple as installing the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) release:
+
+```bash
+pip install deepspeed-mii
+```
+
+To generate text using a simple non-persistent pipeline deployment, run the following code. For more details, please see [Section 5](#using-deepspeed-fastgen).
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+## 2. Existing LLM Serving Techniques in Literature
+
+A text generation workload for a single sequence consists of two phases: 1) prompt processing, in which the user-provided text is efficiently processed as a batch of tokens to build a key-value (KV) cache for attention, and 2) token generation, in which a single token is appended to that cache and a new token is produced. Over the course of generating a sequence of text, the model makes many forward calls to generate the full sequence. Two major techniques have been proposed in the literature and deployed in systems to address the various limitations and bottlenecks that may arise during these phases.
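+To make the two phases concrete, the following sketch shows the shape of such a generation loop over a KV cache; the function and variable names are illustrative, not part of any real API:
+
+```python
+# A minimal sketch of the two phases (illustrative names, not a real API).
+def generate(model, prompt_tokens, max_new_tokens):
+    # Phase 1: prompt processing -- a single forward pass over the full prompt
+    # builds the key-value (KV) cache and yields the first new token.
+    kv_cache, next_token = model.forward(prompt_tokens, kv_cache=None)
+    generated = [next_token]
+    # Phase 2: token generation -- each forward pass consumes one token,
+    # appends its keys/values to the cache, and produces the next token.
+    for _ in range(max_new_tokens - 1):
+        kv_cache, next_token = model.forward([next_token], kv_cache=kv_cache)
+        generated.append(next_token)
+    return generated
+```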
+
+_Blocked KV Caching:_
+
+vLLM identified that memory fragmentation due to large monolithic KV caches significantly reduced the concurrency of LLM serving systems and proposed [Paged Attention](https://arxiv.org/pdf/2309.06180.pdf) to enable non-contiguous caches and increase total system throughput. Rather than assigning individual variable-sized contiguous chunks of memory, the underlying storage in the KV cache consists of fixed-sized blocks (also known as pages). The blocked KV cache increases system throughput by eliminating KV-cache-induced memory fragmentation and thereby increasing potential sequence concurrency. Non-contiguous KV cache implementations are also included in [HuggingFace TGI](https://github.com/huggingface/text-generation-inference) and [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
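+As a rough picture of how such block-based allocation works, consider the following simplified sketch (hypothetical, not vLLM's or DeepSpeed-FastGen's actual allocator):
+
+```python
+# Simplified sketch of fixed-size KV-cache block ("page") allocation.
+BLOCK_SIZE = 16  # tokens per block (illustrative value)
+
+class BlockAllocator:
+    def __init__(self, num_blocks):
+        self.free_blocks = list(range(num_blocks))
+
+    def allocate(self, num_tokens):
+        """Return block ids covering num_tokens; blocks need not be contiguous."""
+        needed = -(-num_tokens // BLOCK_SIZE)  # ceiling division
+        if needed > len(self.free_blocks):
+            raise MemoryError("KV cache exhausted")
+        return [self.free_blocks.pop() for _ in range(needed)]
+
+    def free(self, block_ids):
+        # Freed blocks are immediately reusable by any sequence, so there is
+        # no fragmentation from variable-sized contiguous allocations.
+        self.free_blocks.extend(block_ids)
+```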
+
+_Continuous Batching:_
+
+In the past, dynamic batching, in which a server would wait for multiple requests to process in phase with each other, was used to improve GPU utilization. However, this approach has drawbacks, as it typically requires padding inputs to identical lengths or stalling the system to wait to construct a larger batch.
+
+Recent advancements in large language model (LLM) inference and serving have focused on fine-grained scheduling and memory-efficiency optimizations. For instance, Orca proposes _iteration-level scheduling_ (also known as continuous batching), which makes distinct scheduling decisions at each forward pass of the model. This allows requests to join and leave the batch as needed, eliminating the need for padding requests and thus improving overall throughput. In addition to Orca, continuous batching has been implemented in NVIDIA TRT-LLM, HuggingFace TGI, and vLLM.
+
+In current systems, there are two primary approaches to implementing continuous batching. In TGI and vLLM, the generation phase is preempted to perform prompt processing (called infill in TGI) before continuing with generation. In Orca, these phases are not distinguished; instead, Orca adds a prompt to the running batch so long as the total number of sequences doesn't reach a fixed bound. Both approaches, to varying degrees, need to stall generation to process long prompts (see [Section 3B](#splitfuse)).
+
+To address these shortcomings, we propose a novel prompt and generation composition strategy, Dynamic SplitFuse.
+
+## 3. Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy
+
+DeepSpeed-FastGen is built to leverage continuous batching and non-contiguous KV caches to enable increased occupancy and higher responsiveness for serving LLMs in the data center, similar to existing frameworks such as TRT-LLM, TGI, and vLLM. In order to achieve a new level of performance, DeepSpeed-FastGen introduces SplitFuse, which leverages dynamic prompt and generation decomposition and unification to further improve continuous batching and system throughput.
+
+### A. Three Performance Insights
+Before describing Dynamic SplitFuse, we answer three key performance questions that together motivate its design.
+
+*__1. What factors impact the forward pass of a single LLM?__* In order to schedule effectively, it is necessary to understand which independent variables the scheduling loop should control. We observe below that the composition of sequences in a forward pass (the batch size in sequences) has a negligible impact on performance compared to the raw number of tokens in the forward pass. This means an effective scheduler can be built around a single signal: the number of tokens in the forward pass.
+
+
+
+
+
+*__2. How does a model's throughput respond to changing the number of tokens in the forward pass?__* An LLM has two key operating regions with a relatively steep transition between them. With a small number of tokens, the GPU bottleneck is reading the model from memory, so throughput scales with the number of tokens; with many tokens, the model is compute bound and throughput is near constant. The model runs most efficiently if all forward passes stay in the throughput-saturating region.
+
+
+
+
+
+*__3. How should a pool of tokens be scheduled across multiple forward passes?__* We observe above that for well-aligned inputs the token-throughput curve is concave, which means its second derivative is less than or equal to 0. As an example, let $f(x)$ be the (concave) throughput of a forward pass with $x$ tokens for a given model. For a concave function $f(x)$, the following holds:
+
+ $$0 \geq \lim_{h \to 0} \frac{f(x + h) - 2f(x) + f(x - h)}{h^2}$$
+
+ $$0 \geq f(x + h) - 2f(x) + f(x - h)$$
+
+ $$2f(x) \geq f(x + h) + f(x - h)$$
+
+This states that for a given pool of `2x` tokens to process, throughput is maximized by splitting them evenly between two batches. More generally, in a system that must consume and process P tokens over F forward passes, the ideal partitioning scheme divides them equally.
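+This can be checked numerically; the snippet below assumes an arbitrary concave throughput curve rather than measured data:
+
+```python
+import numpy as np
+
+# An assumed concave token-throughput curve f (illustrative, not measured).
+f = lambda x: 1.0 - np.exp(-x / 512.0)
+
+total = 2048  # a pool of 2x tokens to spend across two forward passes
+for a in range(256, total, 256):
+    b = total - a
+    print(f"{a:5d} + {b:5d} tokens -> combined throughput {f(a) + f(b):.4f}")
+# The combined value peaks at the even split a == b == 1024,
+# matching 2f(x) >= f(x + h) + f(x - h).
+```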
+
+### B. Dynamic SplitFuse
+
+Dynamic SplitFuse is a novel token composition strategy for prompt processing and token generation. DeepSpeed-FastGen utilizes Dynamic SplitFuse to run at a consistent forward size by taking partial tokens from prompts and composing them with generation. In particular, Dynamic SplitFuse performs two key behaviors:
+
+1. Long prompts are decomposed into much smaller chunks and scheduled across multiple forward passes (iterations) with only the final pass performing any generation.
+2. Short prompts will be composed to exactly fill a target token budget. Even short prompts may be decomposed to ensure the budget is precisely met and the forward sizes are well-aligned.
+
+Together, these two techniques provide concrete benefits on all user metrics:
+
+1. *__Better Responsiveness__:* Since long prompts no longer require extremely long forward passes to process, the model will provide lower client latency. More forward passes are performed within the same window of time.
+2. *__Higher Efficiency:__* Fusion of short prompts to larger token budgets enables the model to consistently operate in the high throughput regime.
+3. *__Lower variance and better consistency:__* Since forward passes are of consistent size, and forward pass size is the primary determinant of performance, the latency of each forward pass is much more consistent than in competing systems, as is the perceived generation frequency. There is no preemption and there are no long-running prompt passes to inflate latency, as in prior work.
+
+Consequently, DeepSpeed-FastGen consumes tokens from incoming prompts at a rate that keeps ongoing generation fast while adding tokens that raise system utilization, providing lower-latency and higher-throughput streaming generation to all clients as compared to other state-of-the-art serving systems.
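+The composition policy can be sketched as follows; this is a simplified illustration of the token-budget idea, not the actual DeepSpeed-FastGen scheduler:
+
+```python
+# Simplified illustration of Dynamic SplitFuse scheduling (not the real code).
+TOKEN_BUDGET = 2048  # target tokens per forward pass (illustrative value)
+
+def compose_batch(num_generating_seqs, pending_prompt_lengths):
+    """Return (generation_tokens, prompt_chunks) for one forward pass."""
+    budget = TOKEN_BUDGET - num_generating_seqs  # 1 token per in-flight sequence
+    chunks = []
+    # Fill the remaining budget with (possibly partial) prompt chunks, so long
+    # prompts are split across passes and short ones are fused together.
+    for length in pending_prompt_lengths:
+        if budget <= 0:
+            break
+        chunk = min(budget, length)
+        chunks.append(chunk)
+        budget -= chunk
+    return num_generating_seqs, chunks  # every pass is consistently sized
+
+print(compose_batch(1500, [4000, 100, 100]))  # -> (1500, [548])
+```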
+
+
+
+
+
+ *Figure 1: Illustration of continuous batching strategies. Each block shows the execution of a forward pass. An arrow indicates that the forward pass has sequences with one or more tokens generated. vLLM performs either token generations or prompt processing in a forward pass; token generation preempts prompt processing. Orca runs prompts at their complete length alongside generation. Dynamic SplitFuse performs dynamic composition of fixed-sized batches composed of both generation and prompt tokens.*
+
+
+
+## 4. Performance Evaluation
+
+DeepSpeed-FastGen provides state-of-the-art LLM serving performance leveraging its blocked KV cache and Dynamic SplitFuse continuous batching. We evaluate DeepSpeed-FastGen against vLLM on a range of models and hardware configurations following the benchmarking methodology discussed below.
+
+### A. Benchmarking Methodology
+
+We use two primary quantitative schemes for measuring performance.
+
+**Throughput-Latency Curves:** Two key metrics for production readiness are throughput (measured in requests per second) and latency (the responsiveness of each request). To measure this, we instantiate multiple clients (ranging from 1 to 32) concurrently and send requests (512 in total) to the server. The resulting latency of each request is measured at the endpoint and throughput is measured by the end-to-end time to complete the experiment.
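+As a concrete sketch of this measurement loop (an assumed structure using the client API shown in Section 5, not our released benchmark scripts):
+
+```python
+import time
+from concurrent.futures import ThreadPoolExecutor
+import mii
+
+client = mii.client("mistralai/Mistral-7B-v0.1")
+
+def timed_request(prompt):
+    start = time.time()
+    client.generate(prompt, max_new_tokens=60)
+    return time.time() - start  # per-request latency at the endpoint
+
+prompts = ["..."] * 512  # 512 requests in total
+start = time.time()
+with ThreadPoolExecutor(max_workers=16) as pool:  # 1 to 32 concurrent clients
+    latencies = list(pool.map(timed_request, prompts))
+throughput = len(prompts) / (time.time() - start)  # end-to-end requests/sec
+```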
+
+**Effective Throughput:** Interactive applications, such as chat applications, can have more stringent and complex requirements than can be captured by top-level metrics like end-to-end latency. In particular, we focus on the increasingly popular chat user scenario:
+
+ 1. A user initiates a task by sending a prompt.
+ 2. The system processes the prompt and returns the first token.
+ 3. Subsequent tokens are streamed to the user as they are produced.
+
+At each point in this process there is an opportunity for a system to provide an adverse user experience; for example, if the first token arrives too slowly or the generation appears to stop for some time. We propose an SLA framework that considers both of these dimensions.
+
+As the lengths of prompts and generated texts vary significantly, affecting computational costs, it is impractical to set rigid SLA values for throughput and latency. Therefore, we define the SLA for prompt latency as |tokens in prompt| / 512 seconds (i.e., 512 tokens/s). Additionally, considering typical human reading speed, we set the SLA for generation latency, computed as an exponential moving average (EMA), to 2, 4, or 6 tokens/sec. Requests that adhere to these SLAs are deemed successful, and the throughput of these successful requests is referred to as **effective throughput**.
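+The bookkeeping behind effective throughput can be sketched as follows (assumed field names and EMA details; the real measurement code may differ):
+
+```python
+def meets_sla(prompt_tokens, first_token_latency, token_gaps, ema_target=4.0):
+    # Prompt SLA: first token within |tokens in prompt| / 512 seconds.
+    if first_token_latency > prompt_tokens / 512.0:
+        return False
+    # Generation SLA: exponential moving average of tokens/sec >= target.
+    ema, alpha = None, 0.1
+    for gap in token_gaps:  # seconds between consecutive streamed tokens
+        rate = 1.0 / gap
+        ema = rate if ema is None else alpha * rate + (1 - alpha) * ema
+    return ema is not None and ema >= ema_target
+
+def effective_throughput(requests, total_time):
+    # Throughput counted over successful (SLA-meeting) requests only.
+    return sum(meets_sla(*r) for r in requests) / total_time
+```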
+
+We evaluate vLLM and DeepSpeed-FastGen on Llama-2 7B, Llama-2 13B, and Llama-2 70B on NVIDIA A100, H100, and A6000 GPUs.
+
+### B. Throughput-Latency Analysis
+
+In this experiment, DeepSpeed-FastGen outperforms vLLM in both throughput and latency, providing either greater throughput at equivalent latency or lower latency at the same throughput. On Llama-2 70B with 4 A100-80GB GPUs, DeepSpeed-FastGen demonstrates up to 2x higher throughput (1.36 rps vs. 0.67 rps) at identical latency (9 seconds), or up to 50% latency reduction (7 seconds vs. 14 seconds) while achieving the same throughput (1.2 rps), as shown in Figure 2. These trends hold when evaluating Llama-2 13B, as shown in Figure 3.
+
+
+
+
+ *Figure 2: Throughput and latency of text generation using Llama 2 70B (Tensor parallelism across 4 A100-80GB GPUs). A normal distribution was applied to prompt and generation lengths with averages of 1200/2600 and 128/60, respectively, and a 30% variance*
+
+
+
+
+
+ *Figure 3: Throughput and latency of text generation using Llama 2 13B (A100-80GB GPU, no tensor parallelism). A normal distribution was applied to prompt and generation lengths with averages of 1200/2600 and 60/128, respectively, and a 30% variance*
+
+
+### C. Effective Throughput Analysis
+
+Under the effective throughput analysis that considers both first token latency and the rate at which generation occurs, DeepSpeed-FastGen provides up to 2.3x higher throughput than vLLM. Figure 4 presents a comparative analysis of the effective throughputs of DeepSpeed-FastGen and vLLM. Each plotted point denotes the effective throughput derived from a specific number of clients. As we scaled the number of clients, we initially observed an increase in effective throughput. However, the latency also significantly increases as the number of clients approaches the system's capacity, causing many requests to fail in meeting the SLA. Consequently, the effective throughput will either saturate or decrease at some point. From a usability perspective, it's not particularly relevant how many clients are required to achieve the max effective throughput; the maximum point of the line is the optimal serving point.
+
+
+
+
+ *Figure 4: Effective throughput of DeepSpeed-FastGen and vLLM (Llama 2 70B/A100-80GB using tensor parallelism across 4 A100-80GB GPUs. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance)*
+
+
+When vLLM preempts the ongoing generation of previous requests, the generation latency experiences a notable increase. This leads to vLLM's effective throughput appearing lower than its directly measured throughput. At vLLM's peak, the effective throughput was 0.63 queries/sec and around 28% of requests failed to meet the 4 tokens/s SLA. At the same SLA, DeepSpeed-FastGen achieved 1.42 queries/sec (less than 1% of requests failed to meet the SLA), which is 2.3x higher than vLLM.
+
+### D. Token Level Timing Analysis
+
+Figure 5 displays the P50, P90, and P95 latencies of the generation process. Both vLLM and DeepSpeed-FastGen exhibit similar P50 latencies, but vLLM demonstrates significantly higher latencies at P90 and P95.
+At P95, DeepSpeed-FastGen achieves a 3.7x latency reduction.
+
+This discrepancy is due to a noticeable spike in vLLM's generation latency when it preempts the ongoing generation to process new prompts.
+In contrast, DeepSpeed-FastGen typically processes the prompt and generation for previous requests concurrently, leading to much more consistent generation latency.
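+Given the per-token latency samples collected during a run, these percentiles are straightforward to compute, e.g. (with made-up example data):
+
+```python
+import numpy as np
+
+# Gaps (in seconds) between consecutive streamed tokens, pooled over requests.
+token_latencies = np.array([0.030, 0.032, 0.031, 0.250, 0.033])  # example data
+p50, p90, p95 = np.percentile(token_latencies, [50, 90, 95])
+print(p50, p90, p95)
+```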
+
+
+
+
+
+ *Figure 5: Per-Token generation Latency of Llama 2 70B/A100-80GB using tensor parallelism across 4 A100-80GB GPUs, 16 clients. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 128, respectively, and a 30% variance.*
+
+
+
+### E. Scalability using Load Balancing
+
+DeepSpeed-FastGen offers replica-level load balancing that evenly distributes requests across multiple servers, allowing you to effortlessly scale up your application.
+
+Figure 6 illustrates the scalability of DeepSpeed-FastGen when employing the load balancer with up to 16 replicas. Note that we utilized 4 A100 GPUs per Llama 2 70B replica; in total, we employed 8 nodes to run the 16 replicas. The results demonstrate nearly perfect scalability with DeepSpeed-FastGen:
+given a single-replica throughput of 1.46 queries/sec, the throughput with 16 replicas reaches 23.7 queries/sec, a linear 16x increase over a single replica.
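+To reproduce a load-balanced deployment like this one, DeepSpeed-MII exposes replica and tensor-parallel options at serve time; the sketch below assumes the MII parameter names at the time of writing and a hypothetical Llama 2 70B checkpoint id:
+
+```python
+import mii
+
+# Each replica shards the model across 4 GPUs; 16 replicas span 8 nodes.
+mii.serve(
+    "meta-llama/Llama-2-70b-hf",  # assumed checkpoint id
+    tensor_parallel=4,
+    replica_num=16,
+)
+```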
+
+
+
+
+ *Figure 6: Scalability using the load balancing feature. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+### F. Other Hardware Platforms
+
+In addition to the in-depth analysis on A100, we provide additional benchmarking results for H100 and A6000. The same performance trends observed on A100 hold on both H100 and A6000.
+
+
+
+
+ *Figure 7: Throughput-latency curve and effective throughput of Llama 2 70b using 8 H100 GPUs. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+
+
+
+ *Figure 8: Throughput-latency curve and effective throughput of Llama 2 7b using A6000. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+## 5. DeepSpeed-FastGen: Implementation and Usage
+
+DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed) as illustrated in the figure below. Together, both of these software packages provide various components of the system including the frontend APIs, the host and device infrastructure to schedule batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations.
+
+
+
+
+
+
+
+The fastest way to get started with our alpha release of DeepSpeed-FastGen is: `pip install deepspeed-mii`.
+
+Please follow our [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and reporting issues, please use the [DeepSpeed-MII Github repository](https://github.com/microsoft/DeepSpeed-MII).
+
+### A. Supported Models
+
+We currently support the following model architectures in this alpha release of DeepSpeed-FastGen:
+
+* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
+* [Mistral](https://huggingface.co/models?other=mistral)
+* [OPT](https://huggingface.co/models?other=opt)
+
+All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
+
+We plan to add additional models in the coming weeks and months after the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know.
+
+### B. Deployment options
+All of the examples below are runnable in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). Once installed, you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment:
+
+#### Non-persistent pipeline
+
+The non-persistent pipeline deployment is a great and fast way to get started and requires only a few lines of code. Non-persistent models exist only for the duration of the Python script you are running and are useful for temporary interactive sessions.
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+#### Persistent deployment
+
+A persistent deployment is ideal for use with long-running and production applications. The persistent deployment uses a lightweight GRPC server that can be created using the following 2 lines:
+
+
+```python
+import mii
+mii.serve("mistralai/Mistral-7B-v0.1")
+```
+
+The above server can be queried by multiple clients at once thanks to the built-in load balancer from DeepSpeed-MII. Creating a client also just takes 2 lines of code:
+
+```python
+client = mii.client("mistralai/Mistral-7B-v0.1")
+output = client.generate("Deepspeed is", max_new_tokens=128)
+print(output)
+```
+
+A persistent deployment can be terminated when it is no longer needed:
+
+```python
+client.terminate_server()
+```
+
+### C. Advanced Installation Information
+
+For ease of use and a significant reduction in lengthy compile times that many projects require in this space, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source).
+
+
+# 6. Try Out DeepSpeed-FastGen
+We are very excited to share this DeepSpeed-FastGen alpha release.
+
+* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII)
+
+DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, for example on deep learning research and on applying DeepSpeed to empower real-world AI models and applications. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
+
+The following items are on our roadmap and we plan to engage with our community on these through our GitHub issues and PRs:
+
+- Performance improvements
+- Broader model support
+- New hardware backends through collaboration with partners
+- Release performance benchmarks (used to generate plots in this blog)
+
+**"Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories if you like our work!**
+
+# 7. Acknowledgements
+
+We would like to thank various open-source community projects including HuggingFace, vLLM, and HuggingFace TGI. We have leveraged HF APIs to support models and tokenizers in our alpha release and will continue to add more models. We especially acknowledge and thank the developers of [Flash Attention](https://github.com/Dao-AILab/flash-attention) for their great work. We have extensively leveraged FlashAttention kernels in our system with modifications that have been acknowledged in our code repositories at appropriate file headers. Finally, we want to thank the developers of [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) kernels that we have used in our MoE kernels (released as part of DeepSpeed-Kernels repository).
diff --git a/blogs/deepspeed-fastgen/assets/images/A6000_benchmark.png b/blogs/deepspeed-fastgen/assets/images/A6000_benchmark.png
new file mode 100644
index 000000000000..9d4ab55f5f7a
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/A6000_benchmark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/H100_benchmark.png b/blogs/deepspeed-fastgen/assets/images/H100_benchmark.png
new file mode 100644
index 000000000000..89fb9ca3e1ce
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/H100_benchmark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/effective_throughput.png b/blogs/deepspeed-fastgen/assets/images/effective_throughput.png
new file mode 100644
index 000000000000..11c7f82bc54f
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/effective_throughput.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/effective_throughput_main.png b/blogs/deepspeed-fastgen/assets/images/effective_throughput_main.png
new file mode 100644
index 000000000000..1b9a38306e8e
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/effective_throughput_main.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fast-gen-overview.jpg b/blogs/deepspeed-fastgen/assets/images/fast-gen-overview.jpg
new file mode 100644
index 000000000000..2affbf8a4cc3
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fast-gen-overview.jpg differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-arch-dark.png b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-dark.png
new file mode 100644
index 000000000000..9b90357a3f1b
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-dark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-arch-light.png b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-light.png
new file mode 100644
index 000000000000..9e754abde85d
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-arch-light.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-hero-dark.png b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-dark.png
new file mode 100755
index 000000000000..6ac1a775805b
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-dark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-hero-light.png b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-light.png
new file mode 100755
index 000000000000..af8f1defe653
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-hero-light.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-overview-dark.png b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-dark.png
new file mode 100755
index 000000000000..dde598a985d8
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-dark.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/fastgen-overview-light.png b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-light.png
new file mode 100755
index 000000000000..bdb5f8df483e
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/fastgen-overview-light.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-flops.png b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-flops.png
new file mode 100644
index 000000000000..6d45880588d9
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-flops.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-latency.png b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-latency.png
new file mode 100644
index 000000000000..7c14e2bf6e53
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/observation-prompt-v-latency.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/repl_scale_llama70b_tp4_p2600g60.png b/blogs/deepspeed-fastgen/assets/images/repl_scale_llama70b_tp4_p2600g60.png
new file mode 100644
index 000000000000..834c06dfb07a
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/repl_scale_llama70b_tp4_p2600g60.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p1200g128.png b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p1200g128.png
new file mode 100644
index 000000000000..df16b5bebc53
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p1200g128.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p2600g128.png b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p2600g128.png
new file mode 100644
index 000000000000..8b69a8a1718b
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/th_lat_curve_llama70b_tp4_p2600g128.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/throughput_latency.png b/blogs/deepspeed-fastgen/assets/images/throughput_latency.png
new file mode 100644
index 000000000000..aaceebde7038
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/throughput_latency.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/throughput_latency_13B_no_arrow.png b/blogs/deepspeed-fastgen/assets/images/throughput_latency_13B_no_arrow.png
new file mode 100644
index 000000000000..cc7b8ec1ec05
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/throughput_latency_13B_no_arrow.png differ
diff --git a/blogs/deepspeed-fastgen/assets/images/token_latency.png b/blogs/deepspeed-fastgen/assets/images/token_latency.png
new file mode 100644
index 000000000000..405a3c0d06ed
Binary files /dev/null and b/blogs/deepspeed-fastgen/assets/images/token_latency.png differ
diff --git a/blogs/deepspeed-fastgen/chinese/README.md b/blogs/deepspeed-fastgen/chinese/README.md
new file mode 100644
index 000000000000..fb9cc7319ab6
--- /dev/null
+++ b/blogs/deepspeed-fastgen/chinese/README.md
@@ -0,0 +1,299 @@
+
+
+# DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference
+
+
+
+
+
+
+
+
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Key LLM Serving Techniques](#background)
+3. [Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy](#technical-approach)
+4. [Performance Evaluation](#performance-evaluation)
+5. [DeepSpeed-FastGen: Implementation and Usage](#using-deepspeed-fastgen)
+6. [Try out DeepSpeed-FastGen](#try)
+7. [Acknowledgements](#acknowledgements)
+
+
+## 1. Introduction
+
+Large language models (LLMs) like GPT-4 and LLaMA have become the dominant workload for applications infused with AI at every level. From general chat models to document summarization, and from autonomous driving to copilot features across the software stack, the demand to deploy and serve these models at scale has risen rapidly. While frameworks like DeepSpeed and PyTorch can achieve good hardware utilization during LLM training, the relatively low computational intensity of interactive use and tasks like open-ended text generation makes inference throughput the bottleneck in existing systems.
+
+To address this, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf), powered by PagedAttention, and systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved LLM inference performance. However, these systems still struggle to provide consistent quality of service for workloads with long prompts. These long-prompt workloads are becoming increasingly important as more models (e.g., [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b)) and systems (e.g., [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses)) support context windows stretching to tens of thousands of tokens. To better understand the problem, we give detailed examples below of how LLM text generation works in two phases, prompt processing and generation. When systems treat them as strictly distinct phases, generation is preempted by prompt processing, which can break service level agreements (SLAs).
+
+Today, we are glad to introduce DeepSpeed-FastGen, a framework that overcomes these limitations by adopting our proposed Dynamic SplitFuse technique and delivers up to 2.3x higher effective throughput than state-of-the-art systems like vLLM. DeepSpeed-FastGen combines DeepSpeed-MII and DeepSpeed-Inference to provide an easy-to-use serving system.
+
+**Quick Start:** To use DeepSpeed-FastGen, simply install the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) release:
+
+```bash
+pip install deepspeed-mii
+```
+
+To deploy a simple non-persistent pipeline and generate text, run the following code. For more details, see [Section 5](#using-deepspeed-fastgen).
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+## 2. Existing LLM Serving Techniques
+
+A text generation workload for a single sequence consists of two phases: 1) prompt processing, in which the user-provided text is processed as a batch of tokens to build a key-value (KV) cache for attention, and 2) token generation, in which a single token is appended to the cache and a new token is produced. Over the course of generating a text sequence, the system makes many forward calls to the model to produce the full text. Two major techniques proposed in the literature and deployed in existing systems address the limitations and bottlenecks that can arise in these phases.
+
+_Blocked KV Caching:_
+
+vLLM identified that memory fragmentation caused by large monolithic KV caches significantly reduces the concurrency of LLM serving systems, and proposed [Paged Attention](https://arxiv.org/pdf/2309.06180.pdf) to enable non-contiguous KV caches and increase total system throughput. Rather than allocating variable-sized contiguous chunks of memory per request, the underlying storage of a blocked KV cache consists of fixed-sized blocks (also known as pages). The blocked KV cache increases system throughput by eliminating KV-cache-induced memory fragmentation and thereby increasing potential sequence concurrency. Non-contiguous KV cache implementations are also included in frameworks such as [HuggingFace TGI](https://github.com/huggingface/text-generation-inference) and [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
+
+_Continuous Batching:_
+
+In the past, dynamic batching (where the server waits for multiple requests so they can be processed in lockstep) was used to improve GPU utilization. However, this approach has drawbacks: it typically requires padding inputs to identical lengths or stalling the system to wait for a larger batch to form.
+
+Recent work on LLM inference and serving has focused on fine-grained scheduling and memory-efficiency optimizations. For instance, Orca proposes _iteration-level scheduling_ (also known as continuous batching), which makes distinct scheduling decisions at each forward pass of the model. This allows requests to join and leave the batch as needed, eliminating the need for padding and improving overall throughput. Beyond Orca, continuous batching has also been implemented in NVIDIA TRT-LLM, HuggingFace TGI, and vLLM.
+
+In current systems, there are two primary approaches to implementing continuous batching. In TGI and vLLM, the generation phase is preempted to perform prompt processing (called infill in TGI) before continuing with generation. In Orca, these phases are not distinguished; instead, Orca adds a prompt to the running batch as long as the total number of sequences does not reach a fixed bound. Both approaches, to varying degrees, must stall generation to process long prompts (see [Section 3B](#splitfuse)).
+
+To address these shortcomings, we propose a novel prompt and generation composition strategy, Dynamic SplitFuse.
+
+## 3. Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy
+
+Similar to existing frameworks such as TRT-LLM, TGI, and vLLM, DeepSpeed-FastGen aims to leverage continuous batching and non-contiguous KV caches to improve hardware occupancy and responsiveness when serving LLMs in the data center. To achieve a higher level of performance, DeepSpeed-FastGen introduces SplitFuse, which leverages dynamic prompt and generation decomposition and unification to further improve continuous batching and system throughput.
+
+### A. Three Performance Insights
+Before describing Dynamic SplitFuse, we answer three key performance questions that explain the logic behind it.
+
+*__1. What factors impact the forward pass of a single LLM?__* To schedule effectively, we must first understand which independent variables the scheduling loop should control. We observe below that the composition of sequences in a forward pass (the batch size in sequences) has a negligible impact on performance compared to the raw number of tokens in the forward pass. This means an effective scheduler can be built around a single signal: the number of tokens in the forward pass.
+
+
+
+
+
+*__2. How does a model's throughput respond to changing the number of tokens in the forward pass?__* An LLM has two key operating regions with a relatively steep transition between them. With a small number of tokens, the GPU bottleneck is reading the model from memory, so throughput scales with the number of tokens; with many tokens, throughput is limited by GPU compute and is near constant. The model therefore runs most efficiently if all forward passes stay in the throughput-saturating region.
+
+
+
+
+
+*__3. How should a pool of tokens be scheduled across multiple forward passes?__* We observe in the figures above that for well-aligned inputs the token-throughput curve is concave, meaning its second derivative is less than or equal to 0. Let $f(x)$ be the concave throughput of a forward pass with $x$ tokens for a given model. For a concave function $f(x)$, the following holds:
+
+ $$0 \geq \lim_{h \to 0} \frac{f(x + h) - 2f(x) + f(x - h)}{h^2}$$
+
+ $$0 \geq f(x + h) - 2f(x) + f(x - h)$$
+
+ $$2f(x) \geq f(x + h) + f(x - h)$$
+
+This shows that for a given pool of `2x` tokens, throughput is maximized by splitting them evenly between two batches. More generally, in a system that must process P tokens over F forward passes, the ideal partitioning scheme divides them equally.
+
+### B. Dynamic SplitFuse
+
+Dynamic SplitFuse is a novel token composition strategy for prompt processing and token generation. DeepSpeed-FastGen uses Dynamic SplitFuse to run at a consistent forward size by taking partial tokens from prompts and composing them with generation. Specifically, Dynamic SplitFuse performs two key behaviors:
+
+1. Long prompts are decomposed into much smaller chunks and scheduled across multiple forward passes (iterations), with only the final pass performing any generation.
+2. Short prompts are composed to exactly fill a target token budget. Even short prompts may be decomposed to ensure the budget is precisely met and forward sizes are well-aligned.
+
+Dynamic SplitFuse improves the following performance metrics:
+
+1. **Better responsiveness:** Since long prompts no longer require extremely long forward passes, the model provides lower client latency, and more forward passes are performed within the same window of time.
+2. **Higher efficiency:** Fusing short prompts into larger token budgets enables the model to consistently operate in the high-throughput regime.
+3. **Lower variance and better consistency:** Since forward passes are of consistent size, and forward-pass size is the primary determinant of performance, the latency of each forward pass is much more consistent than in other systems, as is the perceived generation frequency. DeepSpeed-FastGen does not need to preempt generation or run long prompts as prior systems do, so latency is lower.
+
+Consequently, compared to state-of-the-art serving systems, DeepSpeed-FastGen consumes tokens from incoming prompts at a rate that permits fast ongoing generation while adding tokens that raise system utilization, providing lower-latency, higher-throughput streaming generation to all clients.
+
+
+
+
+
+ *Figure 1: Illustration of continuous batching strategies. Each block shows the execution of a forward pass. An arrow indicates a forward pass with sequences that have one or more generated tokens. vLLM performs either token generation or prompt processing in a forward pass; token generation preempts prompt processing. Orca runs prompts at their complete length alongside generation. Dynamic SplitFuse performs dynamic composition of fixed-sized batches composed of both generation and prompt tokens.*
+
+
+
+## 4. Performance Evaluation
+
+DeepSpeed-FastGen provides state-of-the-art LLM serving performance by leveraging its blocked KV cache and Dynamic SplitFuse continuous batching. We evaluate DeepSpeed-FastGen against vLLM on a range of models and hardware configurations following the benchmarking methodology below.
+
+### A. Benchmarking Methodology
+
+We use two primary quantitative schemes to measure performance.
+
+**Throughput-Latency Curves:** Two key metrics for production readiness are throughput (measured in requests per second) and latency (the responsiveness of each request). To measure them, we simulate multiple clients (ranging from 1 to 32) concurrently sending requests (512 in total) to the server. The latency of each request is measured at the endpoint, and throughput is measured by the end-to-end time to complete the experiment.
+
+**Effective Throughput:** Interactive applications, such as chat applications, can have more stringent and complex requirements than can be captured by top-level metrics like end-to-end latency. Consider the increasingly popular chat scenario:
+
+ 1. A user initiates a task by sending a prompt.
+ 2. The system processes the prompt and returns the first token.
+ 3. Subsequent tokens are streamed to the user as they are produced.
+
+At each point in this process, a system can deliver an adverse user experience; for example, the first token may arrive too slowly, or generation may appear to stall for some time. We propose an SLA framework that considers both of these dimensions.
+
+As the lengths of prompts and generated texts vary significantly, affecting computational costs, it is impractical to set a single rigid SLA value for throughput and latency. We therefore define the SLA for prompt latency as |tokens in prompt| / 512 seconds (= 512 tokens/s). Additionally, considering typical human reading speed, we set the SLA for generation latency, computed as an exponential moving average (EMA), to 2, 4, or 6 tokens/sec. Requests that meet these SLAs are deemed successful, and the throughput of these successful requests is referred to as **effective throughput**.
+
+We evaluate vLLM and DeepSpeed-FastGen on Llama-2 7B, Llama-2 13B, and Llama-2 70B on NVIDIA A100, H100, and A6000 GPUs.
+
+### B. Throughput-Latency Analysis
+
+In this experiment, DeepSpeed-FastGen outperforms vLLM in both throughput and latency: it delivers greater throughput at the same latency and lower response latency at the same throughput. As shown in Figure 2, on Llama-2 70B running on 4 A100-80GB GPUs, DeepSpeed-FastGen demonstrates up to 2x higher throughput (1.36 rps vs. 0.67 rps) at identical latency (9 seconds), or up to 50% lower latency (7 seconds vs. 14 seconds) at the same throughput (1.2 rps). DeepSpeed-FastGen shows the same trends when evaluating Llama-2 13B, as shown in Figure 3.
+
+
+
+
+ *Figure 2: Throughput and latency of text generation using Llama 2 70B (tensor parallelism across 4 A100-80GB GPUs). Prompt and generation lengths follow normal distributions with averages of 1200/2600 and 128/60, respectively, and 30% variance*
+
+
+
+
+
+ *Figure 3: Throughput and latency of text generation using Llama 2 13B (one A100-80GB GPU, no tensor parallelism). Prompt and generation lengths follow normal distributions with averages of 1200/2600 and 60/128, respectively, and 30% variance*
+
+
+### C. Effective Throughput Analysis
+
+Under the effective throughput analysis, which considers both first-token latency and the rate at which generation proceeds, DeepSpeed-FastGen provides up to 2.3x higher throughput than vLLM. Figure 4 presents a comparative analysis of the effective throughputs of DeepSpeed-FastGen and vLLM. Each plotted point denotes the effective throughput achieved with a specific number of clients. As we scale the number of clients, effective throughput initially increases. However, as the number of clients approaches the system's capacity, latency also increases significantly, causing many requests to miss the SLA, so effective throughput eventually saturates or decreases. From a usability perspective, the number of clients required to reach the maximum effective throughput is not particularly relevant; the highest point of the curve is the optimal serving point.
+
+
+
+
+ *Figure 4: Effective throughput of DeepSpeed-FastGen and vLLM (Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance)*
+
+
+When vLLM preempts the ongoing generation of previous requests, generation latency increases noticeably. This makes vLLM's effective throughput appear lower than its directly measured throughput. At vLLM's peak, the effective throughput was 0.63 queries/sec, and around 28% of requests failed to meet the 4 tokens/s SLA. At the same SLA, DeepSpeed-FastGen achieved 1.42 queries/sec (with fewer than 1% of requests missing the SLA), 2.3x higher than vLLM.
+
+### D. Token-Level Timing Analysis
+
+Figure 5 displays the P50, P90, and P95 latencies of the generation process. vLLM and DeepSpeed-FastGen exhibit similar P50 latencies, but vLLM's P90 and P95 latencies are significantly higher.
+
+This discrepancy arises because vLLM's generation latency spikes noticeably when it preempts ongoing generation to process new prompts.
+In contrast, DeepSpeed-FastGen typically processes prompts and the generation of previous requests concurrently, yielding much more consistent generation latency.
+
+
+
+
+ *Figure 5: Per-token generation latency of Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs, 16 clients. Prompt and generation lengths follow normal distributions with averages of 2600 and 128, respectively, and 30% variance.*
+
+
+
+### E. Scalability using Load Balancing
+
+DeepSpeed-FastGen offers replica-level load balancing that evenly distributes requests across multiple servers, allowing you to effortlessly scale up your application.
+
+Figure 6 illustrates the scalability of DeepSpeed-FastGen when employing the load balancer with up to 16 replicas. Note that we utilized 4 A100 GPUs per Llama 2 70B replica; in total, we employed 8 nodes to run the 16 replicas. The results demonstrate nearly perfect scalability with DeepSpeed-FastGen:
+given a single-replica throughput of 1.46 queries/sec, 16 replicas reach 23.7 queries/sec, a linear 16x increase over a single replica.
+
+
+
+
+ *Figure 6: Scalability using the load balancing feature. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance*
+
+
+### F. Other Hardware Platforms
+
+In addition to the in-depth analysis on A100, we provide benchmarking results for H100 and A6000. The same performance trends observed on A100 hold on both A6000 and H100.
+
+
+
+
+ *Figure 7: Throughput-latency curve and effective throughput of Llama 2 70b using 8 H100 GPUs. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance*
+
+
+
+
+
+ *Figure 8: Throughput-latency curve and effective throughput of Llama 2 7b using an A6000. Prompt and generation lengths follow normal distributions with averages of 2600 and 60, respectively, and 30% variance*
+
+
+## 5. DeepSpeed-FastGen: Implementation and Usage
+
+DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed), as illustrated in the figure below. Together, these two software packages provide the various components of the system, including the frontend APIs, the host and device infrastructure that schedules batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations.
+
+
+
+
+
+
+
+The fastest way to get started with our alpha release of DeepSpeed-FastGen is: `pip install deepspeed-mii`.
+
+Please follow our [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and to report issues, please use the [DeepSpeed-MII GitHub repository](https://github.com/microsoft/DeepSpeed-MII).
+
+### A. Supported Models
+
+In the current alpha release of DeepSpeed-FastGen, we support the following model architectures:
+
+* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
+* [Mistral](https://huggingface.co/models?other=mistral)
+* [OPT](https://huggingface.co/models?other=opt)
+
+All current models leverage the [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
+
+> We plan to add more models in the weeks and months following the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know.
+
+### B. Deployment Options
+All of the examples below are runnable in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). Once installed, you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment:
+
+#### Non-persistent pipeline
+
+The non-persistent pipeline deployment is a great and fast way to get started and requires only a few lines of code. Non-persistent models exist only for the duration of the Python script you are running and are useful for temporary interactive sessions.
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+#### Persistent deployment
+
+A persistent deployment is ideal for long-running and production applications. The persistent deployment uses a lightweight GRPC server that can be created with the following two lines:
+
+```python
+import mii
+mii.serve("mistralai/Mistral-7B-v0.1")
+```
+
+Thanks to DeepSpeed-MII's built-in load balancer, the above server can be queried by multiple clients at once. Creating a client also takes just two lines of code:
+
+```python
+client = mii.client("mistralai/Mistral-7B-v0.1")
+output = client.generate("Deepspeed is", max_new_tokens=128)
+print(output)
+```
+
+A persistent deployment can be terminated when it is no longer needed:
+
+```python
+client.terminate_server()
+```
+
+### C. Advanced Installation Information
+
+For ease of use and to significantly reduce the lengthy compile times that many projects in this space require, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs of compute capability 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you won't even need to know this library exists, as it is a dependency of DeepSpeed-MII and is installed automatically with it. However, if for whatever reason you need to compile our kernels manually, please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source).
+
+
+# 6. Try Out DeepSpeed-FastGen
+We are very excited to share this first alpha release of DeepSpeed-FastGen.
+
+* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII)
+
+DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of deep learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for the latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page; see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, for example on deep learning research and on applying DeepSpeed to empower real-world AI models and applications. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
+
+The following items are on our roadmap, and we plan to engage with our community on them through GitHub issues and PRs:
+
+- Performance improvements
+- Broader model support
+- New hardware backends through collaboration with partners
+- Releasing the performance benchmarks used to generate the plots in this blog
+
+If you like our work, please "star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories!
+
+# 7. Acknowledgements
+
+We would like to thank various open-source community projects, including HuggingFace, vLLM, and HuggingFace TGI. In this alpha release we leverage HF APIs to provide models and tokenizers, and we plan to add more models. We especially acknowledge and thank the developers of [Flash Attention](https://github.com/Dao-AILab/flash-attention) for their great work; we have extensively leveraged FlashAttention kernels in our system, with modifications acknowledged in the appropriate file headers of our code repositories. Finally, we thank the developers of the [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) kernels that we use in our MoE kernels (released as part of the DeepSpeed-Kernels repository).
diff --git a/blogs/deepspeed-fastgen/japanese/README.md b/blogs/deepspeed-fastgen/japanese/README.md
new file mode 100644
index 000000000000..9729854afcf0
--- /dev/null
+++ b/blogs/deepspeed-fastgen/japanese/README.md
@@ -0,0 +1,315 @@
+
+
+# DeepSpeed-FastGen: Fast Text Generation for LLMs via MII and DeepSpeed-Inference
+
+
+
+
+
+
+
+
+## Table of Contents
+1. [Overview](#introduction)
+2. [Existing Techniques for LLM Text Generation](#background)
+3. [Dynamic SplitFuse: A New Approach to Composing Prompt Processing and Generation](#technical-approach)
+4. [Performance Evaluation](#performance-evaluation)
+5. [DeepSpeed-FastGen: Implementation and Usage](#using-deepspeed-fastgen)
+6. [Try out DeepSpeed-FastGen](#try)
+7. [Acknowledgements](#acknowledgements)
+
+
+## 1. Overview
+
+Large language models (LLMs) such as GPT-4 and LLaMA have become the primary workload in a wide range of applications that use AI for various purposes. From general chat models to document summarization, autonomous driving, and programming assistance at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. While frameworks such as DeepSpeed and PyTorch generally achieve good hardware utilization for LLM training, for tasks such as open-ended text generation the small amount of computation executed at a time on GPUs and other hardware makes inference throughput the bottleneck in existing systems.
+
+Existing systems such as [vLLM](https://arxiv.org/pdf/2309.06180.pdf), powered by PagedAttention, and [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) were designed to address these challenges and have significantly improved LLM inference performance. However, these systems still fall short of providing consistent quality of service, particularly for workloads with long prompts.
+These long-prompt workloads are becoming increasingly important as more models and systems, such as [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b) and [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows of many thousands of tokens.
+To understand these problems more deeply, we explain how LLM text generation works. It consists of two distinct phases, called prompt processing and generation. If a system treats them as completely independent, generation is interrupted by prompt processing, which raises the likelihood of violating service level agreements (SLAs) on system latency and the like.
+
+DeepSpeed-FastGen, introduced in this blog, resolves these challenges by leveraging techniques such as our newly proposed Dynamic SplitFuse, achieving up to 2.3x higher effective throughput than state-of-the-art systems such as vLLM.
+DeepSpeed-FastGen combines DeepSpeed-MII and DeepSpeed-Inference to deliver easy-to-use text generation.
+
+
+**Quick Start:** You can try DeepSpeed-FastGen simply by installing the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII):
+
+
+```bash
+pip install deepspeed-mii
+```
+
+To generate text using the simpler non-persistent pipeline (which does not launch an inference server), run the following code. For more details, see [Section 5](#using-deepspeed-fastgen).
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+## 2. Existing Techniques for LLM Text Generation
+
+A workload that generates a text sequence consists of two phases: 1) prompt processing, in which the user-provided text is efficiently processed as a batch of tokens to build the key-value (KV) cache for attention, and 2) token generation, in which a single token is appended to the KV cache and a new token is produced. Over the course of generating a text sequence, the model makes many forward-pass calls to produce the full text. To resolve the various limitations and bottlenecks in these phases, existing systems adopt two major previously proposed techniques.
+
+_Blocked KV Caching:_
+
+vLLM identified that allocating a huge monolithic memory region for the KV cache significantly reduces the concurrency of LLM text generation systems, and as a solution proposed [Paged Attention](https://arxiv.org/pdf/2309.06180.pdf), which increases total system throughput by using non-contiguously allocated memory regions as the KV cache. Rather than allocating contiguous memory regions of varying sizes per request, it allocates fixed-size memory blocks (also called pages). This blocked KV cache increases the number of sequences that can potentially be processed concurrently by resolving KV-cache-induced memory fragmentation, thereby increasing system throughput. Such non-contiguous KV cache implementations are also included in [HuggingFace TGI](https://github.com/huggingface/text-generation-inference) and [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
+
+
+_Continuous Batching:_
+
+Previously, dynamic batching, in which the server waits so that multiple requests can be processed together, was used to improve GPU utilization. However, this approach has drawbacks: it usually requires padding inputs to the same length, or stalling to wait until enough requests arrive to build a larger batch.
+
+Recent work on LLM inference and serving has focused on finer-grained scheduling and memory-efficiency optimization. For example, Orca proposes _iteration-level scheduling_ (also called continuous batching), which makes scheduling decisions at every forward pass of the model. This allows requests to be added to or removed from the batch as needed, eliminating padding and improving overall throughput. Continuous batching is implemented not only in Orca but also in NVIDIA's TRT-LLM, HuggingFace's TGI, and vLLM.
+
+In current systems, there are two primary approaches to implementing continuous batching. In TGI and vLLM, the generation phase is interrupted to perform prompt processing (called infill in TGI) before generation continues. In Orca, these phases are not distinguished; instead, a prompt is added to the running batch as long as the total number of sequences stays below a fixed limit. Both approaches are similar, to varying degrees, in that they must pause generation to process long prompts (see [Section 3B](#splitfuse)).
+
+
+To address these challenges, we propose a new method called Dynamic SplitFuse that composes prompt processing and generation.
+
+
+## 3. Dynamic SplitFuse: A New Approach to Composing Prompt Processing and Generation
+
+DeepSpeed-FastGen was developed to achieve higher occupancy and responsiveness when serving LLMs in the data center by leveraging continuous batching and non-contiguous KV caches, like existing frameworks such as TRT-LLM, TGI, and vLLM. To achieve a higher level of performance, DeepSpeed-FastGen introduces SplitFuse, which dynamically decomposes prompts and combines them with generation to further improve continuous batching and system throughput.
+
+
+### A. Three Performance Insights
+
+Before describing Dynamic SplitFuse, we present three key performance questions, and their answers, that motivate its design.
+
+*__1. What factors affect a single LLM forward pass?__* To schedule effectively, it is necessary to understand which relevant independent variables the iteratively executed scheduling loop should control. We observe below that the composition of sequences in a forward pass (the batch size in sequences) has a negligible impact on performance compared to the raw number of tokens in the forward pass. This means an effective scheduler can be built by focusing mainly on a single factor: the number of tokens in the forward pass.
+
+
+
+
+
+*__2. How does model throughput respond to changes in the number of tokens in the forward pass?__* An LLM has two key operating regions with a relatively steep transition between them. With a small number of tokens, the GPU bottleneck is reading the model from memory, so throughput scales with the number of tokens; with many tokens, throughput is limited by compute and is nearly constant. For efficient execution, it is desirable that all forward passes run in the throughput-saturating region.
+
+
+
+
+
+*__3. How should a pool of tokens be scheduled across multiple forward passes?__* As noted above, when inputs are well-aligned, the token-throughput curve is concave, which means its second derivative is less than or equal to 0. As an example, let $f(x)$ be the concave throughput of a forward pass with $x$ tokens for a given model. For a concave function $f(x)$, the following holds:
+
+ $$0 \geq \lim_{h \to 0} \frac{f(x + h) - 2f(x) + f(x - h)}{h^2}$$
+
+ $$0 \geq f(x + h) - 2f(x) + f(x - h)$$
+
+ $$2f(x) \geq f(x + h) + f(x - h)$$
+
+This states that for a pool of `2x` tokens to process, the way to maximize throughput is to split them evenly into two batches. More generally, in a system that must process `P` tokens over `F` forward passes, the ideal partitioning scheme splits them equally.
+
+### B. Dynamic SplitFuse
+
+Dynamic SplitFuse is a new approach to composing prompt processing and token generation. DeepSpeed-FastGen uses Dynamic SplitFuse to run at a consistent forward size by taking partial tokens from prompts and combining them with generation. Dynamic SplitFuse consists of the following two key behaviors:
+
+1. Long prompts are decomposed into much smaller chunks and scheduled across multiple forward passes (iterations); generation is performed only in the final pass.
+2. Short prompts are scheduled so that the target number of tokens for a forward pass is met exactly. Even short prompts may be decomposed so that the token budget given to a forward pass is met precisely and forward sizes are even across passes.
+
+Combining these two techniques yields concrete benefits on all user metrics:
+
+1. *__Better responsiveness__*: Since long prompts no longer cause extremely long forward passes, the model achieves lower latency as seen by the client. More forward passes are executed within the same window of time.
+2. *__Higher efficiency__*: Running short prompts together with tokens from other requests keeps the model operating consistently at high throughput.
+3. *__Lower latency variance and better consistency__*: The number of tokens given to a single forward pass varies less. Since the token count of a forward pass is the primary determinant of performance, the latency of each forward pass is much more consistent than in competing systems. There is no latency increase from preemption or long-running prompts as in other prior work.
+
+As a result, by adding tokens to forward passes in a way that raises system utilization, DeepSpeed-FastGen can process the prompts of arriving requests while keeping ongoing generation fast. This provides
+lower-latency, higher-throughput streaming generation to all clients compared to other state-of-the-art text generation systems.
+
+
+
+
+
+
+*Figure 1: Illustration of continuous batching strategies. Each block shows the execution of a forward pass. An arrow indicates a forward pass with sequences that have one or more generated tokens. vLLM performs either token generation or prompt processing in a forward pass; token generation preempts prompt processing. Orca runs prompts at their complete length alongside generation. Dynamic SplitFuse performs dynamic composition of fixed-size batches composed of both generation and prompt tokens.*
+
+
+## 4. Performance Evaluation
+
+DeepSpeed-FastGen provides state-of-the-art LLM serving performance by leveraging its blocked KV cache and Dynamic SplitFuse continuous batching. We evaluate DeepSpeed-FastGen and vLLM on a range of models and hardware configurations following the benchmarking methodology discussed below.
+
+### A. Benchmarking Methodology
+
+We use two primary quantitative schemes to measure performance.
+
+**Throughput-Latency Curves**: Two key metrics for production use are throughput (measured in requests per second) and latency (the responsiveness of each request). To measure them, we launch multiple clients (from 1 to 32) concurrently and send requests (512 in total) to the server. The resulting latency is measured per request, and throughput is measured by the end-to-end time to complete the experiment.
+
+**Effective Throughput**: Interactive applications such as chat applications can have stricter and more complex requirements than are captured by top-level metrics like end-to-end latency. Here we focus on the rapidly spreading chat-application user scenario:
+
+1. The user initiates a task by sending a prompt.
+2. The system processes the prompt and returns the first token.
+3. Subsequent tokens are streamed to the user as they are generated.
+
+At each point in this process, the user experience can suffer; for example, if the first token arrives too slowly, or if generation appears to stop for a while. We propose an SLA framework that takes both of these aspects into account.
+
+Since prompt and generated-text lengths vary widely, and this affects computational cost, it is unrealistic to set a single rigid SLA value for throughput and latency. We therefore define the prompt-latency SLA as |tokens in prompt| / 512 seconds (= 512 tokens/s). In addition, considering human reading speed, we set the generation-latency SLA, on an exponential moving average (EMA), to 2, 4, or 6 tokens per second. Requests that satisfy these SLAs are considered successful, and the throughput of these successful requests is the **effective throughput**.
+
+We evaluated vLLM and DeepSpeed-FastGen with Llama-2 7B, Llama-2 13B, and Llama-2 70B on NVIDIA A100, H100, and A6000 GPUs.
+
+### B. Throughput-Latency Analysis
+
+In this experiment, DeepSpeed-FastGen outperforms vLLM in both throughput and latency, providing lower latency at the same throughput or higher throughput at the same latency. With Llama-2 70B on 4 A100 GPUs (80GB each), DeepSpeed-FastGen demonstrates 2x higher throughput (1.36 rps vs. 0.67 rps) at the same latency (9 seconds), or up to 50% lower latency (7 seconds vs. 14 seconds) while achieving the same throughput (1.2 rps). These results are shown in Figure 2. The same trend holds in the Llama-2 13B evaluation shown in Figure 3.
+
+
+
+
+
+ *Figure 2: Throughput and latency of text generation (Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs). Prompt and generation lengths are drawn from normal distributions with averages of 1200/2600 and 128/60, respectively, and 30% variance.*
+
+
+
+
+
+ *Figure 3: Throughput and latency of text generation (Llama 2 13B on one A100-80GB GPU without tensor parallelism). Prompt and generation lengths are drawn from normal distributions with averages of 1200/2600 and 60/128, respectively, and 30% variance.*
+
+
+### C. Effective Throughput Analysis
+
+In the effective throughput analysis, which considers both first-token latency and the rate of generation, DeepSpeed-FastGen delivers up to 2.3x the performance of vLLM. Figure 4 shows a comparative analysis of the effective throughput of DeepSpeed-FastGen and vLLM. Each plotted point represents the effective throughput obtained with a particular number of clients. We observed that effective throughput initially increases as the number of clients grows. However, as the number of clients approaches the system's capacity, latency also increases substantially, and many requests fail to meet the SLA. As a result, effective throughput peaks at some point and then decreases. From a usability standpoint, the number of clients needed to reach maximum effective throughput is not particularly important; the highest point of the curve is the optimal serving point.
+
+
+
+
+ *Figure 4: Effective throughput of DeepSpeed-FastGen and vLLM. Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+When vLLM interrupts the ongoing generation of previous requests to process new prompts, generation latency increases noticeably. This makes vLLM's effective throughput appear lower than its directly measured throughput. At vLLM's peak, effective throughput was 0.63 queries/sec, and about 28% of requests failed to meet the 4 tokens/sec SLA. At the same SLA, DeepSpeed-FastGen achieved 1.42 queries/sec (fewer than 1% of requests missed the SLA), 2.3x that of vLLM.
+
+### D. Token-Level Latency Analysis
+
+Figure 5 shows the P50, P90, and P95 latencies of the generation process. Comparing vLLM and DeepSpeed-FastGen, there is no large difference in P50 latency, but vLLM shows markedly higher P90 and P95 latencies.
+For P95 latency, DeepSpeed-FastGen achieves a 3.7x reduction.
+
+This difference arises because vLLM's generation latency spikes noticeably when it interrupts ongoing generation to process new prompts.
+In contrast, DeepSpeed-FastGen typically processes the prompts and generation of previous requests concurrently, achieving much more consistent generation latency.
+
+
+
+
+ *Figure 5: Per-token generation latency. Llama 2 70B using tensor parallelism across 4 A100-80GB GPUs, 16 clients. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 128, respectively, and 30% variance.*
+
+
+
+### E. Scalability using Load Balancing
+DeepSpeed-FastGen provides replica-level load balancing that distributes requests evenly across multiple servers, allowing applications to scale up easily.
+
+Figure 6 shows the scalability of DeepSpeed-FastGen using the load balancer with up to 16 replicas. We used 4 A100 GPUs per replica to compute the Llama 2 70B model, and 8 nodes in total to run the 16 replicas. The results demonstrate nearly perfect scalability for DeepSpeed-FastGen: given a single-replica throughput of 1.46 queries/sec, the throughput with 16 replicas reaches 23.7 queries/sec, a linear 16x increase over a single replica.
+
+
+
+
+ *Figure 6: Scalability using the load balancing feature. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+### F. Other Hardware Platforms
+
+In addition to the analysis on A100 GPUs, we provide benchmark results on H100 and A6000. The same performance trends as on A100 were observed on both A6000 and H100.
+
+
+
+
+ *Figure 7: Throughput-latency curve and effective throughput of Llama 2 70b using 8 H100 GPUs. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+
+
+
+ *Figure 8: Throughput-latency curve and effective throughput of Llama 2 7b using an A6000. Prompt and generation lengths are drawn from normal distributions with averages of 2600 and 60, respectively, and 30% variance.*
+
+
+## 5. DeepSpeed-FastGen: Implementation and Usage
+
+DeepSpeed-FastGen is a synergistic combination of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed), as shown in the figure below. These software packages provide the various components of the system: the frontend APIs, the host and device infrastructure that schedules batches using Dynamic SplitFuse, optimized kernel implementations, and the tools for building new model implementations.
+
+
+
+
+
+
+
+The easiest way to get started with the alpha release of DeepSpeed-FastGen is to run `pip install deepspeed-mii`.
+
+For details, see the [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide. For usage and to report issues, please use the [DeepSpeed-MII GitHub repository](https://github.com/microsoft/DeepSpeed-MII).
+
+### A. Supported Models
+
+The current alpha release of DeepSpeed-FastGen supports the following model architectures:
+
+* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
+* [Mistral](https://huggingface.co/models?other=mistral)
+* [OPT](https://huggingface.co/models?other=opt)
+
+All current models use [HuggingFace](https://github.com/huggingface) in the backend to provide both the model weights and the model's corresponding tokenizer.
+
+We plan to add more models in the weeks and months following the initial release. If there is a specific model architecture you would like supported, please file an [issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know.
+
+### B. Deployment Options
+
+All of the examples below can be run in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). After installation, there are two deployment options: an interactive non-persistent pipeline, or a persistent serving deployment.
+
+#### Non-persistent pipeline
+
+The non-persistent pipeline deployment is very easy to get started with and can be run in just a few lines of code.
+Non-persistent models only live for the duration of the Python script, but are convenient for temporary interactive sessions.
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+#### Persistent deployment
+
+A persistent deployment is intended for long-running and production applications. A persistent deployment launches a lightweight GRPC server, which can be done with the following two lines:
+
+```python
+import mii
+mii.serve("mistralai/Mistral-7B-v0.1")
+```
+
+Thanks to DeepSpeed-MII's built-in load balancer, the above server can receive queries from multiple clients at once. A client can also be used with just two lines of code:
+
+```python
+client = mii.client("mistralai/Mistral-7B-v0.1")
+output = client.generate("Deepspeed is", max_new_tokens=128)
+print(output)
+```
+
+A persistent deployment can be terminated when it is no longer needed, as follows:
+
+```python
+client.terminate_server()
+```
+
+### C. Advanced Installation Information
+
+In similar projects, compiling custom kernels often takes a very long time.
+To greatly shorten this compile time and improve convenience, DeepSpeed-FastGen distributes a pre-compiled Python wheel covering the majority of its custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels).
+We have found this library to be very portable across environments with NVIDIA GPUs of compute capability 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+.
+Since this library is installed as a dependency of DeepSpeed-MII, in most cases you do not need to know it exists. However, if for some reason you need to compile the kernels manually, see the [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source).
+
+## 6. Try Out DeepSpeed-FastGen
+
+We are very excited to share this DeepSpeed-FastGen alpha release with you.
+
+* To get started, please visit the DeepSpeed-MII GitHub page: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII)
+
+DeepSpeed-FastGen is part of the larger DeepSpeed ecosystem, which comprises a multitude of deep learning systems and modeling technologies. To learn more, please visit our
+[website](https://www.deepspeed.ai/), where you will find detailed blog posts, tutorials, and helpful documentation.
+For the latest DeepSpeed news, follow us on [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed).
+
+DeepSpeed welcomes your contributions. You can report bugs, submit pull requests, and join discussions on the DeepSpeed GitHub page; the [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) has more details.
+We also collaborate with universities, research labs, and companies working on deep learning research and on applying DeepSpeed to real-world AI models and applications. For such collaboration requests (and other topics not suited for GitHub), please email us directly at deepspeed-info@microsoft.com.
+
+The following items are on our roadmap, and we plan to engage with our community on them through GitHub issues and PRs:
+
+- Performance improvements
+- Broader model support
+- New hardware backends through collaboration with partners
+- Release of the performance benchmarks that generated the plots in this blog
+
+If you like this project, please give a "star" to the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories.
+
+## 7. Acknowledgements
+
+We thank various open-source community projects including HuggingFace, vLLM, and HuggingFace TGI. We leverage HF APIs to support models and tokenizers in our alpha release and will continue to add more models. We especially thank the developers of [Flash Attention](https://github.com/Dao-AILab/flash-attention) for their great work. We have extensively leveraged FlashAttention kernels in our system, with appropriate acknowledgements in the relevant file headers of our code repository. Finally, we thank the developers of the [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) kernels that we use in our MoE kernels (released as part of the DeepSpeed-Kernels repository).
diff --git a/blogs/deepspeed-offloadpp/README.md b/blogs/deepspeed-offloadpp/README.md
new file mode 100644
index 000000000000..1441da5a35c0
--- /dev/null
+++ b/blogs/deepspeed-offloadpp/README.md
@@ -0,0 +1,52 @@
+# DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow
+
+Deep learning has been successfully adopted in a wide range of applications such as speech recognition, chatbots, and text and image generation. To achieve better serving accuracy, model sizes have grown significantly. Taking language models as an example, from BERT with 110 million parameters to Megatron-Turing NLG with 530 billion parameters, model size has grown almost 5000x. Given limited GPU memory, we need to utilize it efficiently to achieve good system throughput.
+
+ZeRO offers a memory-efficient data-parallel training scheme. Even when training large models like LLMs with ZeRO, GPU memory is still often insufficient to hold all the model parameters. ZeRO-Offload was introduced to solve this problem: it relieves GPU memory pressure by offloading data and compute to the CPU side while minimizing CPU-GPU data copy overhead. Given that CPU memory is often orders of magnitude larger than GPU memory, ZeRO-Offload was the first work to enable training of billion-parameter models even with very limited GPU memory resources (in the extreme, a single GPU). ZeRO-Offload performs excellently when the model is several times larger than the total GPU memory.
+
+However, system efficiency is still far from optimal when adopting ZeRO-Offload in some scenarios, especially with small batch sizes or with models that do not fit in GPU memory yet are not orders of magnitude bigger than its capacity. In these cases, CPU offload not only introduces long end-to-end latency but also underutilizes GPU compute. To reduce memory copy latency and the resulting GPU underutilization in these offload cases, we propose ZeRO-Offload++, which leverages both CPU and GPU coherently. ZeRO-Offload++ comprises three new features: _Twin-Flow_, MemCpy reduction, and CPUAdam optimization. In this release we ship the __Twin-Flow__ feature.
+
+The key benefits are:
+* With _Twin-Flow_, ZeRO-Offload++ achieves up to **6x** training speedup compared with ZeRO-Offload.
+* A high-level API provided in the DeepSpeed config JSON makes the feature easy to use and tune.
+
+![h100-img](./images/h100-8.png)
+
+## Twin-Flow
+
+In DeepSpeed, when training with a popular optimizer like Adam, optimizer offloading follows an all-or-nothing policy. In the simplified example shown in the figure below, without offloading, all parameters are updated on the GPU by the FusedAdam optimizer. On the other hand, if offloading is enabled, all model weights are updated by CPUAdam.
+
+![cpu-offload-img](./images/cpu-offload.png)
+
+The major downside of this all-or-nothing offloading is that when all optimizer states are offloaded to the CPU, both GPU memory and compute resources remain underutilized. Although increasing the batch size improves GPU utilization, each training iteration is still very long compared with the no-offloading case. To improve GPU compute and memory utilization while decreasing training iteration time, we introduce a new feature in our DeepSpeed training engine called _Twin-Flow_.
+
+In comparison, _Twin-Flow_ allows a portion of the optimizer states to be held in CPU memory while the remaining portion stays in GPU memory. When the optimization step is triggered, the CPU and GPU update parameters simultaneously. Once offloading is enabled, we provide an offload-ratio configuration that lets users choose what percentage of the model weights is updated on the CPU side, with the rest updated on the GPU. "_Twin_" comes from the idea that both CPU and GPU run the same optimizer function; "_Flow_" means parameters are not only held in both host and device memory but also computed using both CPU and GPU cores.
+
+As shown in the figure below, with ZeRO-Offload enabled and a _Twin-Flow_ ratio of 0.4 (40%), the DeepSpeed training engine automatically assigns the optimizer step for the first 40% (i.e., 0-40%) of the weights to CPUAdam on the CPU side, while the GPU-side FusedAdam updates the remaining 60% (i.e., 40-100%) of the model parameters in parallel. Therefore, with _Twin-Flow_, we achieve decent GPU memory and core utilization while reducing training iteration time in optimizer-offloading cases.
+
+![_Twin-Flow_-img](./images/twin-offload.png)
+
+Note that the _Twin-Flow_ ratio can be adjusted based on how much idle GPU memory is available. The smaller the ratio, the more GPU memory and cores are used and the shorter the training iteration. The ideal setting gets as close as possible to the GPU memory upper bound in order to minimize training iteration time.
+Note that _Twin-Flow_ is not limited to the Adam optimizer; it can be applied to any optimizer (e.g., AdaGrad) on the user side.
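+
+For illustration, a minimal DeepSpeed config JSON sketch with the offload ratio set to 0.4 is shown below; the exact schema, in particular the `ratio` field under `offload_optimizer`, should be checked against the tutorial linked below for your DeepSpeed version:
+
+```json
+{
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true,
+      "ratio": 0.4
+    }
+  }
+}
+```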
+
+## Performance Evaluation
+
+We conducted performance evaluations on both A100 and H100 DGX machines, testing OPT models with 13B and 30B parameters. We ran OPT-13B training on an 8x A100 DGX machine and OPT-30B training on an 8x H100 DGX machine. With some tuning of the offload ratio in ZeRO-Offload++, we achieved 6x and 3x training speedups for Meta OPT models on a single DGX-H100-80GB and DGX-A100-40GB, respectively (top-most figure and bottom figure here).
+
+![a100-img](./images/a100-8.png)
+
+## Ongoing Optimizations
+
+* Reduce unnecessary D2H/H2D memcpy
+
+* On-the-fly fp16 to fp32 casting for CPUAdam
+
+## Tutorials
+
+Examples and tutorials are available [here](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/offload_pp/README.md)
+
+## Contributors:
+
+This project was made possible by the contributions of the following people from the DeepSpeed team:
+
+[Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), Masahiro Tanaka, Xiaoxia Wu, Lok Chand Koppaka, Samyam Rajbhandari, [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/), [Yuxiong He](https://www.microsoft.com/en-us/research/people/yuxhe/) (team lead)
diff --git a/blogs/deepspeed-offloadpp/images/a100-8.png b/blogs/deepspeed-offloadpp/images/a100-8.png
new file mode 100644
index 000000000000..22b787f69e1e
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/a100-8.png differ
diff --git a/blogs/deepspeed-offloadpp/images/cpu-offload.png b/blogs/deepspeed-offloadpp/images/cpu-offload.png
new file mode 100644
index 000000000000..cc4dae505cd3
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/cpu-offload.png differ
diff --git a/blogs/deepspeed-offloadpp/images/h100-8.png b/blogs/deepspeed-offloadpp/images/h100-8.png
new file mode 100644
index 000000000000..938625d52aaf
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/h100-8.png differ
diff --git a/blogs/deepspeed-offloadpp/images/twin-offload.png b/blogs/deepspeed-offloadpp/images/twin-offload.png
new file mode 100644
index 000000000000..1c8c3ef92454
Binary files /dev/null and b/blogs/deepspeed-offloadpp/images/twin-offload.png differ
diff --git a/blogs/deepspeed-triton/README.md b/blogs/deepspeed-triton/README.md
new file mode 100644
index 000000000000..071b5d4bc6d0
--- /dev/null
+++ b/blogs/deepspeed-triton/README.md
@@ -0,0 +1,95 @@
+# DeepSpeed with Triton compiler
+
+# 1. Overview
+
+We have integrated [Triton](https://github.com/openai/triton), an open-source compiler for GPU programming, into DeepSpeed, which further boosts the inference speed of BERT-like models in float16 precision.
+By replacing some CUDA kernels or torch operators with Triton kernels, we achieved a 1.14\~1.68x speedup (or 12\~41% latency reduction) across different models and GPUs, as shown in Table 1.
+
+
+
+| Hardware | Bert-base | Bert-large | Roberta-base | Roberta-large |
+|----------|:------:|:------:|:------:|:------:|
+| A100 |1.65x | 1.68x | 1.53x | 1.61x |
+| V100 | 1.29x | 1.14x | 1.23x | 1.21x |
+
+Table 1. The average speedup (see the NOTE below for more details)
+
+
+
+
+For the transformer operators in float16, we have implemented kernels written in the Triton language that replace ordinary CUDA kernels or torch operators.
+The Triton kernels we implemented include softmax, layer normalization, residual addition, and all the matrix multiplications except the MLP layers (see the NOTE below for details).
+In our experiments, the Triton kernels reduce the average latency (over different sequence lengths) by 6\~24% (depending on model and hardware) compared to the latency with CUDA-only kernels.
+
+
+The figures below show the latency reduction in more detail.
+Figure 1 visualizes the latency reduction across different sequence lengths on an A100 GPU for the Bert-base model.
+The baseline (blue) is from Hugging Face transformers without any kernel injection, the orange is from DeepSpeed with CUDA-only kernels, and the gray is from DeepSpeed with Triton kernels.
+Figure 2 shows the same plot for the Bert-large model on an A100 GPU.
+
+
+
+
+
+*Figure 1: Normalized P90 latency for the Bert-base model on an A100 GPU across different sequence lengths*
+
+
+
+*Figure 2: Normalized P90 latency for the Bert-large model on an A100 GPU across different sequence lengths*
+
+
+
+
+Next, we dive deeper into this new feature in DeepSpeed.
+
+# 2. How to use Triton in DeepSpeed
+
+You can enable the Triton kernels by setting the corresponding flags in the DeepSpeed inference config, as in the following example:
+
+```python
+import torch
+import deepspeed
+from transformers import pipeline
+
+pipe = pipeline('fill-mask', model='bert-base-cased', framework='pt', device=0)
+pipe.model = deepspeed.init_inference(pipe.model,
+                                      dtype=torch.float16,
+                                      replace_with_kernel_inject=True,
+                                      enable_cuda_graph=True,
+                                      use_triton=True,
+                                      triton_autotune=True,
+                                      max_out_tokens=pipe.tokenizer.model_max_length)
+```
+
+
+## Running BERT inference with Triton kernels
+
+We use Bert-base as an example here.
+
+```bash
+pip install deepspeed[triton]
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/inference/huggingface/fill-mask
+
+deepspeed --num_gpus 1 test-bert.py --triton
+```
+
+To run a performance benchmark, you can use the following command:
+
+```bash
+pip install deepspeed[triton]
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/benchmarks/inference
+
+deepspeed --num_gpus 1 triton-bert-benchmark.py --model bert-base-cased --dtype fp16 --kernel-inject --deepspeed --graphs --triton
+```
+
+# NOTE
+
+* For more information on how to use DeepSpeed, please visit our [GitHub Page](https://github.com/microsoft/DeepSpeedExamples) and our [website](https://www.deepspeed.ai/), where you can find blog posts, tutorials, and documentation.
+
+* This feature is currently only supported for BERT, Roberta and other BERT-like models, and not for text-generation models yet.
+
+* To achieve the best performance with Triton optimization, you need to activate CUDA graphs and `triton_autotune` in the DeepSpeed config. CUDA graphs avoid the overhead of JIT compilation and of the deep call stack in Triton. `triton_autotune` executes an initial step to find the most suitable parameters for the Triton kernels, which may take some time.
+
+* We used [Triton 2.0.0.post1 release](https://pypi.org/project/triton/2.0.0.post1/) in our experiments.
+
+* In our experiments, we used a batch size of 1, a sequence length range of 8 to 512, and a `fill-mask` task. Table 1 shows the average P90 latency over the entire sequence length range, while Figures 1 and 2 show the P90 latency for specific sub-ranges. The baseline is Hugging Face transformers without any optimization. The speedup is calculated as (baseline P90 latency)/(DeepSpeed-Triton P90 latency). We found that the CUDA kernel in the MLP performed better than the Triton kernel in our experiments, so we use a hybrid approach that combines both kernels when Triton is enabled in the DeepSpeed config.
diff --git a/blogs/deepspeed-ulysses/README.md b/blogs/deepspeed-ulysses/README.md
new file mode 100644
index 000000000000..7ea7a4535e90
--- /dev/null
+++ b/blogs/deepspeed-ulysses/README.md
@@ -0,0 +1,370 @@
+
+
+# DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed-Ulysses, please cite our [arxiv report](https://arxiv.org/abs/2309.14509):
+
+```
+@article{jacobs2023deepspeed,
+ title={DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models},
+ author={Sam Ade Jacobs and Masahiro Tanaka and Chengming Zhang and Minjia Zhang and Shuaiwen Leon Song and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14509},
+ year={2023},
+}
+```
+
+## Introduction
+
+Training large models with long sequences is becoming very important
+across the board, from generative AI to models for scientific discovery.
+On the generative AI side, conversational AI, long document summarization,
+and video generation require reasoning over long contexts in spatial and
+temporal domains. For example, multimodal foundation models, such as ones
+that process speech, images, and waveforms concurrently, require long
+context reasoning over high-dimensional inputs with extremely large
+sequences. Similarly, chapter- and book-level summarization (estimated at
+tens to hundreds of thousands of words) is of great importance in
+conversational AI and abstractive summarization tasks.
+
+Long sequence length is equally critical for AI for science, opening
+doors for a better understanding of structural biology, health care,
+climate and weather forecasting, and large molecular simulation. For
+instance, by adapting large language models to gene sequences, we can
+create language models that learn the evolutionary patterns of genomes
+from simple alphabets and extremely long sequences (the human genome
+has 6.4 billion letters). In health care, a diagnostic predictive
+model conditioned on an entire patient care record requires context of
+an extremely long sequence.
+
+Despite the emerging importance of long sequence length for both
+generative AI and AI for science, existing large model training systems
+and the underlying parallelism technologies (data, tensor, pipeline,
+sequence parallelism) are limited in their ability to support
+efficient long-sequence training. Two challenges with existing
+parallelism approaches come to the fore. First, existing parallelism
+approaches such as data, tensor, and pipeline parallelism cannot address
+scaling along the sequence dimension. Second, existing sequence
+parallelism approaches are not effective because of memory-communication
+inefficiencies. Furthermore, existing
+approaches have limited usability, requiring intrusive and error-prone
+code refactoring.
+
+In this release, we are proud to introduce *DeepSpeed-Ulysses (or
+Ulysses, a very long novel)*, a simple, portable, and effective
+methodology for enabling highly efficient and scalable LLM training with
+extremely long sequence lengths.
+
+DeepSpeed-Ulysses partitions individual samples along the sequence
+dimension among the participating GPUs. Then, right before the attention
+computation, it employs an *all-to-all communication* collective on the
+partitioned queries, keys, and values such that each GPU receives the
+full sequence but only for a non-overlapping subset of the attention
+heads. This allows the participating GPUs to compute attention for
+different attention heads in parallel. Finally, DeepSpeed-Ulysses
+employs another all-to-all to gather the results along the attention
+heads while re-partitioning along the sequence dimension.
+
+The key properties of DeepSpeed-Ulysses and its implementation released
+with this blog are as follows:
+
+* ***4x larger sequence lengths*** than existing systems, while
+enabling training with sequences of ***over a million tokens***.
+
+* Communication reduction of ***over 10x*** compared to existing
+systems, resulting in throughput improvements of ***up to 2.5x***, and
+sustained throughput of over 175 TFlops/GPU (over 54% of hardware peak).
+
+* Fully general and implementation-agnostic attention: DeepSpeed
+sequence parallelism supports dense as well as sparse
+attention, and it works with efficient attention implementations such as
+FlashAttention v2.
+
+* Support for massive model training: DeepSpeed sequence parallelism
+works together with ZeRO-3 to not only support large sequence lengths
+but also massive model sizes.
+
+* Easy-to-use and portable, requiring minimal code changes to the
+existing training frameworks.
+
+In the subsequent sections, we provide a detailed discussion of the
+DeepSpeed-Ulysses core design, a communication complexity analysis,
+an experimental evaluation and comparison with existing work, and a
+highlight of usability along with a guide on usage.
+
+## Core Design of DeepSpeed-Ulysses
+
+
+
+
+*Figure 1: DeepSpeed sequence parallelism (DeepSpeed-Ulysses) design*
+
+
+Figure 1 shows the core design of DeepSpeed-Ulysses. As in the standard
+transformer architecture, the design consists of an input sequence of
+length *N* partitioned across *P* available devices. Each local *N/P*
+partition is projected into query (Q), key (K), and value (V)
+embeddings. Next, the (QKV) embeddings are gathered into a global QKV
+through highly optimized all-to-all collectives between the
+participating compute devices. Following the all-to-all collective,
+the attention computation per head takes the form:
+
+$$Output\ context = Softmax\ (\frac{QK^{T}}{\sqrt{d}})V$$
+
+After the attention computation, another all-to-all collective
+transforms the *output context* tensor of the attention computation back
+to sequence (*N/P*) parallelism for the subsequent operators (MLP MatMul,
+layer norm, etc.) in the remaining modules of the transformer layer block.
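+
+To make the two re-partitioning steps concrete, below is a minimal
+PyTorch sketch of the first all-to-all (sequence-parallel shards to
+head-parallel shards). The function name and tensor layout are
+illustrative assumptions, not DeepSpeed's actual implementation, which
+lives in `deepspeed.sequence.layer`:
+
+```python
+import torch
+import torch.distributed as dist
+
+def seq_to_head_all_to_all(x: torch.Tensor, group) -> torch.Tensor:
+    # x: [N/P, H, d] local sequence shard holding all H attention heads.
+    # Returns: [N, H/P, d] the full sequence for a local subset of heads.
+    P = dist.get_world_size(group=group)
+    n_local, H, d = x.shape
+    # Chunk the head dimension into P groups; chunk p is sent to rank p.
+    x = x.reshape(n_local, P, H // P, d).permute(1, 0, 2, 3).contiguous()
+    out = torch.empty_like(x)
+    dist.all_to_all_single(out, x, group=group)
+    # out[p] is rank p's sequence shard for our local heads; stacking the
+    # shards reconstructs the full sequence of length N = P * n_local.
+    return out.reshape(P * n_local, H // P, d)
+```
+
+The second all-to-all after attention is the same operation with the
+roles of the sequence and head dimensions swapped.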
+
+### Significant Communication Volume Reduction
+
+What distinguishes DeepSpeed-Ulysses from other existing
+long-sequence approaches is our much smaller aggregate communication
+volume and overall better scalability with an increasing degree of
+sequence parallelism, as demonstrated by the
+communication volume analysis below:
+
+On modern clusters with intra-node NVSwitch interconnect and inter-node
+fat-tree IB topology, the communication volume transmitted per link for
+an all-to-all of an aggregate message of size *M* over *P* GPUs is *M/P*.
+For a transformer model with hidden size h, sequence length N, and
+parallelism degree P, DeepSpeed sequence parallelism performs an
+all-to-all for the QKV projections with an aggregate message size of
+*3Nh* before the attention computation, and another all-to-all for the
+output context projection with a size of *Nh*, for each transformer
+layer. Therefore, DeepSpeed sequence parallelism incurs an aggregate
+communication volume per link of ***4Nh/P (i.e., a complexity of
+O(N/P))***. Note that this communication volume is constant when both
+N and P are increased proportionally.
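+
+As a back-of-the-envelope illustration (the numbers below are chosen
+only for the arithmetic, not drawn from the evaluation): with h = 8192,
+N = 1M tokens, and P = 64, the per-link volume 4Nh/P is roughly 0.5G
+elements, or about 1 GiB per layer in fp16, whereas a per-link volume
+of 4Nh would be about 64 GiB, i.e., P times larger.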
+
+In contrast, existing approaches like Megatron-LM incur a
+communication volume that increases linearly with N regardless of P,
+resulting in a ***communication complexity of O(N)***. For instance,
+Megatron-LM performs two *all-gather* operations with a message volume
+of *Nh* and two *reduce-scatter* operations with a volume of *Nh* for
+each transformer layer. However, the cost of each all-gather and
+reduce-scatter of size M remains M when *P \>\> 1*, instead of *M/P*.
+Therefore, Megatron-LM sequence parallelism incurs a communication
+volume per link of ***4Nh***, which is P times larger than that of
+DeepSpeed sequence parallelism. This allows DeepSpeed sequence
+parallelism to enable training with extremely long sequences while
+achieving significantly higher training efficiency compared to
+existing approaches. Our evaluation results match this analysis.
+
+### Additional Highlights of DeepSpeed-Ulysses
+
+***An Attention Agnostic Solution***
+
+The DeepSpeed implementation of the distributed attention module is
+general enough to support any attention: e.g., self-attention,
+cross-attention, and causal attention, in both their dense and sparse
+counterparts, as well as their various optimized kernels that support
+long sequences at the local attention level, such as different versions
+of FlashAttention.
+
+The generality of DeepSpeed-Ulysses stems from the modular nature of
+its core design: an attention-centric sequence parallelism design.
+Before the attention computation, sequence parallelism operates on an
+N/P partition of the sequence; the attention computation itself is head
+parallelism with full attention per head, just with fewer heads per
+device. The attention computation can therefore be replaced with any
+type of attention mechanism, e.g., dense attention or various forms of
+sparse attention.
+
+***Training Bigger Models with Longer Sequences through ZeRO-3 Integration***
+
+While DeepSpeed sequence parallelism reduces the activation memory when
+training with longer sequences, it does not impact the memory consumed
+by the model states. Therefore, to support large sequence length
+training with large language models, DeepSpeed sequence parallelism is
+integrated with ZeRO-3.
+
+[ZeRO Redundancy Optimizer Stage 3 (ZeRO-3)](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) is a memory optimization technique for training large
+models. Unlike the classic data parallel training of neural networks
+where model states are replicated across data parallel ranks, ZeRO-3
+optimizes memory usage by partitioning model states across data parallel
+ranks. However, with sequence parallelism, the training data can be
+considered along both the batch (sample) and sequence dimensions, and
+the associated parallel groups can be combined to form a larger group
+for ZeRO parallelism.
+
+Therefore, we extend ZeRO-3 partitioning to the combination of data
+parallel and sequence parallel ranks. In other words, in DeepSpeed
+sequence parallelism, ZeRO partitions model states across both the
+sequence and data parallel groups and collects the per-rank partitions
+(via allgather) when they are needed. Similarly, gradients are reduced
+across both data and sequence parallel ranks for the parameter update.
+ZeRO allows for huge memory savings in both the sequence and data
+dimensions and enables scaling not just to large sequence lengths but
+also to large models.
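+
+From a user's perspective, enabling this combination is largely a
+matter of turning on ZeRO-3 in the DeepSpeed config while the training
+script sets the sequence parallelism degree. A minimal config sketch is
+shown below (the command-line flag named afterwards is
+Megatron-DeepSpeed's, stated here as an assumption for illustration):
+
+```json
+{
+  "zero_optimization": { "stage": 3 },
+  "train_micro_batch_size_per_gpu": 1,
+  "bf16": { "enabled": true }
+}
+```
+
+combined with, e.g., `--ds-sequence-parallel-size 4` on the
+Megatron-DeepSpeed command line.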
+
+## Evaluation
+
+We evaluate DeepSpeed-Ulysses (Ulysses) on GPT,
+a foundation model for many NLP tasks, on up to 64 A100 GPUs with 40GB
+memory. Our evaluation is four-fold: i) sequence length scalability,
+ii) throughput for dense attention and comparison with existing systems,
+iii) throughput for sparse attention and comparison with existing
+systems, and iv) a convergence study of DeepSpeed sequence parallelism.
+We discuss and present the evaluations from each of these categories next.
+
+### Sequence Length Scalability
+
+The first set of experiments is strong scaling of sequence length up to
+1 million tokens on a 1.2 billion parameter GPT model. The results of
+this evaluation are shown in Figure 2. DeepSpeed sequence parallelism
+allows increasing the sequence length linearly with the number of GPUs
+while maintaining similar computation throughput across different
+sequence lengths at the corresponding GPU counts.
+
+
+
+
+*Figure 2: DeepSpeed sequence parallelism strong scalability evaluation
+at different sequence lengths and GPU counts.*
+
+
+### Dense Attention Evaluation
+
+Next, we evaluate Ulysses on 7 billion (7B) and 30 billion (30B) parameter
+GPT dense attention models and compare against Megatron-LM's sequence
+parallelism (Megatron LM) and Colossal-AI sequence parallelism (ColAI-SP) on
+32 and 64 A100 GPUs, respectively. The results of these evaluations are
+shown in Figures 3 and 4.
+
+We compare Ulysses with Megatron-LM and ColAI-SP for the 7B and 30B models
+running various sequence lengths. We chose the sequence parallelism
+degree and micro-batch size that produced the best performance
+(measured in TFLOPs) for each of the three methods; we call these the
+optimal (batch size, sequence length) configurations. For Ulysses, we
+always use a ZeRO-3 parallelism degree of 32 and 64 for the 7B and 30B
+models, respectively.
+
+
+Figures 3 and 4 show that Ulysses consistently outperforms Megatron-LM
+and ColAI-SP at the sequence lengths they can run. In addition,
+Ulysses can run longer sequences than the two existing methods. Ulysses'
+performance advantages are twofold: (1) Ulysses, in combination with
+ZeRO-3 parameter sharding across both data and sequence parallel groups,
+fits more samples than Megatron-LM and ColAI-SP because of the memory
+optimization, leading to higher throughput; (2) Ulysses benefits from
+efficient *all-to-all* communication relative to the *all-gather*,
+*reduce-scatter*, and *ring-style* P2P communication applied in
+Megatron-LM and ColAI-SP sequence parallelism.
+However, for dense attention at long sequence lengths, throughput is
+primarily determined by the local attention computation due to the
+quadratic computation complexity of attention; therefore, the performance
+gap between Ulysses and the two existing methods closes at the sequence
+lengths they can all run.
+
+
+
+
+*Figure 3: Evaluation of Ulysses vs. Megatron LM vs. ColAI-SP on a GPT-7B
+ model with dense attention (32 GPUs).*
+
+
+
+
+
+*Figure 4: Evaluation of Ulysses vs. Megatron LM vs. ColAI-SP on a GPT-30B
+ model with dense attention (64 GPUs).*
+
+
+### Sparse Attention Evaluation
+
+Similarly, we evaluate Ulysses on 7 billion and 30 billion parameter sparse
+attention models and benchmark against Megatron-LM sequence parallelism.
+There is no public implementation of block sparse attention for ColAI-SP;
+therefore, the sparse attention evaluation is in comparison with Megatron-LM
+only. The results of our evaluation are shown in Figures 5 and 6. We observe
+similar trends with sparse attention as in the dense attention experiments.
+We observe more than 2x higher throughput for Ulysses compared to
+Megatron-LM. With its memory savings, Ulysses leveraging ZeRO-3 scales to
+4x longer sequence lengths than Megatron-LM.
+
+Ulysses outperforms Megatron-LM at the sequence lengths both can run.
+In fact, the current Ulysses throughput is bottlenecked by the local sparse
+attention implementation, and as a result Ulysses' throughput decreases as
+the sequence length increases. We expect the performance gap between our
+method and Megatron-LM to increase further at larger sequence lengths as we
+improve the performance of the local sparse attention implementation in the
+future. A noteworthy observation is that the narrowing performance gap
+between Ulysses and Megatron-LM observed in the dense attention evaluation
+is less pronounced in the sparse attention evaluation, because the attention
+computation is less dominant in sparse attention than in dense attention.
+
+
+
+
+*Figure 5: Evaluation of Ulysses and Megatron LM sequence parallelism on a
+GPT-7B model with block sparse attention (32 GPUs).*
+
+
+
+
+
+*Figure 6: Evaluation of Ulysses and Megatron LM sequence parallelism on a
+GPT-30B model with block sparse attention (64 GPUs).*
+
+
+### Convergence Study
+
+Lastly, Figure 7 shows the convergence of a 1.3 billion parameter GPT
+model at 32K sequence length on 8 A100 GPUs, with the sequence
+parallelism degree set to 4 for both DeepSpeed and Megatron-LM sequence
+parallelism. For DeepSpeed sequence parallelism, we evaluate convergence
+with different ZeRO stages. DeepSpeed sequence parallelism is a purely
+system-level optimization technique that enables training of long-sequence
+transformer models; thus, there is no (negative) impact on the quality of
+the trained models. This assertion is validated through the experiments
+shown in Figure 7.
+
+
+
+
+*Figure 7: Convergence evaluation of DeepSpeed sequence parallelism with different
+ZeRO memory optimization stages.*
+
+
+## DeepSpeed-Ulysses Software Accessibility
+
+DeepSpeed-Ulysses can be easily integrated into your code with just a
+few lines of simple code changes. Here is an example of how to enable
+it:
+
+```python
+from deepspeed.sequence.layer import DistributedAttention
+
+# Replace the original self-attention (attn) with DeepSpeed-Ulysses’s self-attention
+
+dist_attn = DistributedAttention(attn, get_sequence_parallel_group())
+```
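+
+Here `attn` is the original local attention module that
+DistributedAttention wraps, and `get_sequence_parallel_group()` stands
+in for however your training framework exposes the sequence parallel
+process group; the helper name is illustrative rather than part of the
+DeepSpeed API.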
+
+Compared to other libraries that support sequence parallelism, such as
+Megatron-LM, DeepSpeed-Ulysses does not require model refactoring.
+DeepSpeed-Ulysses has been fully integrated and tested with the
+Megatron-DeepSpeed code repository. This means that if you are already
+using this repository for training large language models, you can
+seamlessly benefit from DeepSpeed-Ulysses to train models with massive
+sequence length.
+
+## Release: Try DeepSpeed-Ulysses Today
+
+We are excited to release DeepSpeed-Ulysses, accessible through the
+DeepSpeed GitHub. A detailed tutorial on usage is available on the
+[DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/ds-sequence/).
+
+We welcome contributions and collaboration as we together push forward
+on what is possible when long context windows are no longer a limitation.
+DeepSpeed-Ulysses is part of the bigger DeepSpeed ecosystem of
+large-scale AI training and inference. For more details on all DeepSpeed
+technologies and innovations, please visit our [website](https://www.deepspeed.ai/) and follow us
+on X, formerly Twitter ([English](https://twitter.com/MSFTDeepSpeed), [Japanese](https://twitter.com/MSFTDeepSpeedJP)), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed).
+
+We are open to collaborations with universities, research labs, and
+companies. For such requests (and other requests unsuitable for GitHub),
+please directly email deepspeed-info@microsoft.com. If you like
+our work, please "Star" our [repo](https://github.com/microsoft/DeepSpeed).
diff --git a/blogs/deepspeed-ulysses/chinese/README.md b/blogs/deepspeed-ulysses/chinese/README.md
new file mode 100644
index 000000000000..20af8b91fbea
--- /dev/null
+++ b/blogs/deepspeed-ulysses/chinese/README.md
@@ -0,0 +1,155 @@
+
+
+# DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models
+
+
+
+
+
+
+
+
+
+## Introduction
+
+Training with long sequences is becoming very important, from generative AI to models for scientific research.
+In generative AI, tasks such as conversational AI, long document summarization, and video generation all require reasoning over long contexts in the spatial and temporal domains.
+For example, multimodal foundation models, such as those that process speech, images, and waveforms concurrently, require long-context reasoning over high-dimensional inputs with extremely long sequences.
+Likewise, chapter- and book-level summarization (tens or even hundreds of thousands of words) is very important in conversational AI and summarization tasks.
+
+Long sequences are equally critical for AI for science, opening doors to a better understanding of structural biology, health care, climate and weather forecasting, and large molecular simulation.
+For example, by training large language models on gene sequences, we can create language models that learn the evolutionary patterns of genomes from extremely long sequences (the human genome has 6.4 billion base pairs). In health care, diagnostic predictive models conditioned on entire patient care records require context over extremely long sequences.
+
+Although the importance of long sequence lengths keeps growing for both generative AI and AI for science, existing large model training systems and the underlying parallelism technologies (data, tensor, pipeline, and sequence parallelism) cannot support efficient long-sequence training. Existing parallelism methods face two main challenges. First, existing methods such as data, tensor, and pipeline parallelism cannot address scaling along the sequence dimension. Second, existing sequence parallelism methods are not efficient enough due to memory-communication inefficiencies. In addition, existing methods lack ease of use, requiring intrusive, complex, and error-prone code refactoring.
+
+To solve these problems, we are delighted to introduce *DeepSpeed-Ulysses (or Ulysses, a very long novel)*, a simple, easy-to-use, and efficient methodology for enabling highly efficient and scalable LLM training with extremely long sequence lengths.
+
+DeepSpeed-Ulysses partitions each sample along the sequence dimension across the participating GPUs. Then, before the attention computation, it performs an *all-to-all communication* operation on the partitioned queries (Q), keys (K), and values (V) so that each GPU receives the full sequence, but only for a non-overlapping subset of the attention heads. This allows the participating GPUs to compute different attention heads in parallel. Finally, DeepSpeed-Ulysses uses another all-to-all to gather the results along the attention heads while re-partitioning along the sequence dimension.
+
+The key properties of DeepSpeed-Ulysses and of the implementation released with this blog are as follows:
+
+* ***4x larger sequence lengths*** than existing systems, supporting training with sequences of ***over a million tokens***.
+
+* Communication reduction of ***over 10x*** compared to existing systems, yielding throughput improvements of ***up to 2.5x*** and sustained per-GPU throughput of over 175 TFlops (over 54% of the hardware peak).
+
+* Fully general attention: DeepSpeed sequence parallelism supports both dense and sparse attention, and works with efficient attention implementations such as FlashAttention v2.
+
+* Support for large-scale model training: DeepSpeed sequence parallelism not only supports long sequence lengths but can also be combined with ZeRO-3 to support large model sizes.
+
+* Easy to use and portable, requiring minimal code changes to existing training frameworks.
+
+In the following sections, we discuss in detail the core design of DeepSpeed-Ulysses, its communication complexity analysis, the experimental evaluation and comparison with existing work, and present its usability and a usage guide.
+
+## Core Design of DeepSpeed-Ulysses
+
+
+
+
+*Figure 1: DeepSpeed sequence parallelism (DeepSpeed-Ulysses) design*
+
+
+Figure 1 shows the core design of DeepSpeed-Ulysses. As in the standard transformer architecture, the design consists of an input sequence of length *N* partitioned across *P* available devices. Each local *N/P* partition is projected into query (Q), key (K), and value (V) embeddings. Next, the (QKV) embeddings are gathered into a global QKV through highly optimized all-to-all collectives between the participating compute devices. After the all-to-all collective, the per-head attention computation takes the form:
+
+$$Output\ context = Softmax\ (\frac{QK^{T}}{\sqrt{d}})V$$
+
+After the attention computation, another all-to-all collective transforms the output context tensor of the attention computation back to sequence (*N/P*) parallelism for the subsequent operators (MLP MatMul, layer norm, etc.) in the remaining modules of the transformer layer.
+
+### Significant Communication Volume Reduction
+
+What distinguishes DeepSpeed-Ulysses from other existing long-sequence methods is its smaller aggregate communication volume and its better scalability as the degree of sequence parallelism increases, as shown below:
+
+On modern clusters with intra-node NVSwitch interconnect and inter-node fat-tree IB topology, for an all-to-all with an aggregate message of size *M* across *P* GPUs, the communication volume per link is *M/P*.
+For a transformer model with hidden size h, sequence length N, and parallelism degree P, DeepSpeed sequence parallelism performs an all-to-all with an aggregate message size of *3Nh* for the QKV projections before the attention computation, and another all-to-all of size *Nh* for the output context projection after the attention computation. Therefore, the aggregate communication volume per link for DeepSpeed sequence parallelism is ***4Nh/P (or O(N/P) in complexity)***. Notably, this communication volume is constant when N and P are increased proportionally.
+
+In contrast, existing methods such as Megatron-LM incur communication volume that grows linearly with N regardless of P, resulting in a ***communication complexity of O(N)***. For example, Megatron-LM performs two *all-gather* operations of size *Nh* and two *reduce-scatter* operations of size *Nh* for each transformer layer. However, when *P \>\> 1*, the cost of each all-gather and reduce-scatter of size M remains M rather than *M/P*. Therefore, Megatron-LM sequence parallelism incurs a communication volume per link of ***4Nh***, which is P times larger than that of DeepSpeed sequence parallelism. This allows DeepSpeed sequence parallelism to support extremely long sequence training while achieving significantly higher training efficiency. Our experimental results match this theoretical analysis.
+
+### Additional Highlights of DeepSpeed-Ulysses
+
+***A General Attention Solution***
+
+The DeepSpeed distributed attention module is implemented generally enough to support any type of attention, such as self-attention, cross-attention, and causal attention, in both their dense and sparse versions, as well as the various optimized kernels that support long sequences at the local attention level, such as different versions of FlashAttention.
+
+The generality of DeepSpeed-Ulysses comes from the modular nature of its core design: an attention-centric sequence parallelism design. Before the attention computation, sequence parallelism partitions over N/P; the attention computation itself is head parallelism, keeping full attention per head but with fewer heads per device. The attention computation can therefore be replaced with any type of attention mechanism, such as dense attention or various forms of sparse attention.
+
+***Training Bigger Models with Longer Sequences through ZeRO-3 Integration***
+
+Although DeepSpeed sequence parallelism reduces activation memory when training with longer sequences, it does not affect the memory footprint of the model states. Therefore, to support long sequence length training of large language models, we integrated DeepSpeed sequence parallelism with ZeRO-3.
+
+[ZeRO Redundancy Optimizer Stage 3 (ZeRO-3)](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) is a memory optimization technique for training large models. Unlike classic data-parallel training of neural networks, where model states are replicated across data parallel ranks, ZeRO-3 optimizes memory usage by partitioning model states across data parallel ranks. However, with sequence parallelism, the training data can be considered along both the batch (sample) and sequence dimensions, and the associated parallel groups can be combined into a larger group for ZeRO parallelism.
+
+Therefore, we extend ZeRO-3 partitioning to the combination of data parallel and sequence parallel ranks. In other words, in DeepSpeed sequence parallelism, ZeRO partitions the model states across both the sequence and data parallel groups and collects the per-rank partitions (allgather) when needed. Similarly, gradients are reduced across both the data and sequence parallel ranks for the parameter update. ZeRO achieves huge memory savings in both the sequence and data dimensions, scaling not only to long sequence lengths but also to large models.
+
+## Evaluation
+
+We evaluated DeepSpeed-Ulysses on GPT (a foundation model for many NLP tasks) using up to 64 A100 GPUs (40GB memory). Our evaluation is four-fold: i) sequence length scalability, ii) throughput for dense attention and comparison with existing systems, iii) throughput for sparse attention and comparison with existing systems, and iv) a convergence study of DeepSpeed sequence parallelism. We discuss and present the evaluation results for each category next.
+
+### Sequence Length Scalability
+
+The first set of experiments scales the sequence length up to 1 million tokens on a 1.2 billion parameter GPT model. The results are shown in Figure 2. DeepSpeed sequence parallelism allows the sequence length to increase linearly with the number of GPUs, maintaining similar computation throughput across different sequence lengths at the corresponding GPU counts.
+
+
+
+
+*Figure 2: DeepSpeed sequence parallelism strong scalability evaluation with different sequence lengths and GPU counts.*
+
+
+### Dense Attention Evaluation
+
+Next, we evaluated DeepSpeed sequence parallelism on a 30 billion parameter dense attention model and compared it against Megatron sequence parallelism on 64 A100 GPUs. The results of these evaluations are shown in Figure 3.
+
+We compared the performance of DeepSpeed sequence parallelism and Megatron-LM across different sequence lengths. For our evaluation, we chose the sequence length and batch size combinations that give DeepSpeed sequence parallelism and Megatron-LM their respective best performance (measured in throughput or TFLOPs), which we call the optimal (batch size, sequence length) configurations. For DeepSpeed sequence parallelism, we always used a ZeRO parallelism degree of 64.
+
+Figure 3 shows that DeepSpeed sequence parallelism consistently outperforms Megatron-LM at the same sequence lengths. In addition, DeepSpeed sequence parallelism can run longer sequences than Megatron-LM. Its performance advantages are twofold: (1) combined with the memory optimization of ZeRO-3, DeepSpeed sequence parallelism can fit more samples, improving throughput; (2) DeepSpeed sequence parallelism benefits from more efficient all-to-all communication relative to the *all-gather* communication applied in Megatron-LM sequence parallelism.
+
+
+
+
+*Figure 3: Dense attention evaluation of DeepSpeed and Megatron LM sequence parallelism on a 30 billion parameter model.*
+
+
+### Sparse Attention Evaluation
+
+Similarly, we evaluated DeepSpeed sequence parallelism on a 30 billion parameter sparse attention model and benchmarked it against Megatron sequence parallelism. The results are shown in Figure 4. The sparse attention experiments show trends similar to the dense attention experiments. We observe more than 2x higher throughput for DeepSpeed sequence parallelism compared with Megatron-LM. Through its memory savings, DeepSpeed sequence parallelism combined with ZeRO-3 scales to sequence lengths 4x longer than Megatron-LM.
+
+DeepSpeed sequence parallelism consistently outperforms Megatron-LM at the same sequence lengths. In fact, the current DeepSpeed throughput is bottlenecked by the local sparse attention implementation, so DeepSpeed's throughput decreases as the sequence length increases. We expect the performance gap between DeepSpeed and Megatron to widen further at larger sequence lengths as the performance of the local sparse attention implementation improves in the future.
+
+
+
+
+*Figure 4: Sparse attention evaluation of DeepSpeed and Megatron LM sequence parallelism on a 30 billion parameter model.*
+
+
+### Convergence Study
+
+Finally, Figure 5 shows the convergence of a 1.3 billion parameter GPT model at a 32K sequence length on 8 A100 GPUs, with the sequence parallelism degree set to 4. For DeepSpeed sequence parallelism, we evaluated convergence with different ZeRO stages. DeepSpeed sequence parallelism is a purely system-level optimization technique that enables the training of long-sequence transformer models, so there is no (negative) impact on the quality of the trained models; this is validated experimentally and shown in Figure 5.
+
+
+
+
+*Figure 5: Convergence evaluation of DeepSpeed sequence parallelism with different ZeRO memory optimization stages.*
+
+
+## DeepSpeed-Ulysses Software Accessibility
+
+DeepSpeed-Ulysses can be integrated into your code with just a few simple code changes. Here is an example of how to enable it:
+
+```python
+from deepspeed.sequence.layer import DistributedAttention
+
+# Replace the original self-attention (attn) with DeepSpeed-Ulysses's self-attention
+
+dist_attn = DistributedAttention(attn, get_sequence_parallel_group())
+```
+
+Compared with other libraries that support sequence parallelism, such as Megatron-LM, DeepSpeed-Ulysses does not require model refactoring.
+DeepSpeed-Ulysses has been fully integrated and tested with the Megatron-DeepSpeed code repository. This means that if you are already
+using this repository to train large language models, you can seamlessly use DeepSpeed-Ulysses to train models with extremely long sequences.
+
+## Try DeepSpeed-Ulysses Today
+
+We are excited to announce the release of DeepSpeed-Ulysses; the code is available through DeepSpeed GitHub. A detailed usage tutorial is provided on the [DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/ds-sequence/).
+
+We welcome contributions and collaboration of all kinds as we together push forward on the innovations that become possible when long context windows are no longer a limitation. DeepSpeed-Ulysses is part of the bigger DeepSpeed ecosystem for large-scale AI training and inference. For more details on all DeepSpeed technologies and innovations, please visit our [website](https://www.deepspeed.ai/) and follow us on X (Twitter) ([English](https://twitter.com/MSFTDeepSpeed), [Japanese](https://twitter.com/MSFTDeepSpeedJP)) and on [Chinese Zhihu](https://www.zhihu.com/people/deepspeed).
+
+We are open to collaborations with universities, research labs, and companies. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
diff --git a/blogs/deepspeed-ulysses/japanese/README.md b/blogs/deepspeed-ulysses/japanese/README.md
new file mode 100644
index 000000000000..88a0e375ce70
--- /dev/null
+++ b/blogs/deepspeed-ulysses/japanese/README.md
@@ -0,0 +1,158 @@
+
+
+# DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models
+
+
+
+
+
+
+
+
+
+## Overview
+
+Training huge models with long sequences is becoming very important across the board, from generative AI to models for scientific discovery.
+In generative AI, tasks such as conversational AI, long-form summarization, and video generation require understanding long contexts in the spatial and temporal domains.
+For example, multimodal foundation models that process speech, images, and waveforms concurrently must understand long contexts from high-dimensional inputs with extremely long sequence lengths. Similarly, chapter- and book-level summarization (estimated at tens to hundreds of thousands of words) is very important in conversational AI and summarization tasks.
+
+The ability to handle long sequences is also important for the use of AI in science, with the potential to advance structural biology, health care, climate and weather forecasting, and large-scale molecular simulation. For example, by adapting large language models to gene sequences, we can create language models that learn the evolutionary patterns of genomes from extremely long sequences over a simple alphabet (the human genome has 6.4 billion letters). In the medical field, diagnostic prediction models conditioned on entire patient care records must handle context expressed as extremely long sequences.
+
+While the importance of handling long sequences is rapidly growing in generative AI and science, existing training systems for large models and the underlying parallelization technologies (data, tensor, pipeline, and sequence parallelism) have not been able to train long sequences efficiently. Existing parallelization approaches face two challenges. First, existing widely used parallel approaches such as data, tensor, and pipeline parallelism cannot scale along the sequence dimension. Second, existing sequence parallelism approaches do not achieve high efficiency because of the communication of data in memory. In addition, existing approaches require large-scale code changes that easily introduce errors into existing code.
+
+This release introduces *DeepSpeed-Ulysses (or Ulysses, named after a very long novel)*, a new methodology that enables the processing of extremely long sequences efficiently and scalably in the training of LLMs (large language models).
+
+
+DeepSpeed-Ulysses partitions individual samples along the sequence dimension across multiple GPUs. Then, immediately before the attention computation in the transformer, it applies *all-to-all* communication to the queries (Q), keys (K), and values (V).
+Through this all-to-all communication, the data is partitioned across the GPUs in units of attention heads without overlap, while each GPU holds the entire sequence. Each GPU then computes different attention heads, so the computation proceeds in parallel. After the attention computation, another all-to-all communication re-partitions the results along the sequence dimension.
+
+The main features of DeepSpeed-Ulysses and the implementation introduced in this blog are as follows.
+
+
+* Training with ***4x longer sequence lengths*** than existing systems (***over 1 million tokens***).
+
+* ***Over 10x communication reduction*** compared to existing systems, enabling ***up to 2.5x higher throughput*** and a sustained throughput of 175 TFlops/GPU (over 54% of the hardware peak).
+
+* Generality independent of the attention implementation: not only dense attention algorithms but also sparse algorithms can be used, and efficient attention implementations such as FlashAttention v2 are easy to adopt.
+
+* Support for training large models: in conjunction with ZeRO-3, it supports not only long sequences but also huge model sizes.
+
+* Applicable to existing training frameworks with minimal code changes.
+
+In the following sections, we describe the central design ideas of DeepSpeed-Ulysses in detail, analyze its communication cost, present an experimental evaluation and comparison with existing methods, and then explain how to use it.
+
+
+## Design of DeepSpeed-Ulysses
+
+
+
+
+*Figure 1: Design of DeepSpeed-Ulysses*
+
+
+Figure 1 shows the central design of DeepSpeed-Ulysses. As in the standard transformer architecture, an input sequence of length *N* is partitioned across *P* available devices. From the partitioned sequence of size *N/P* on each device, the queries (Q), keys (K), and values (V) are computed. Next, a global QKV is formed from the local QKV on each device through all-to-all collective communication. Following the all-to-all communication, attention is computed per head as follows:
+
+$$Output\ context = Softmax\ (\frac{QK^{T}}{\sqrt{d}})V$$
+
+After the attention computation, all-to-all communication is performed again, and the output is partitioned along the sequence dimension for the subsequent operators of the transformer layer (MLP, layer norm, etc.); the partitioned sequence length on each device becomes *N/P* again.
+
+### Significant Reduction in Communication Volume
+
+What distinguishes DeepSpeed-Ulysses from existing parallelization methods for long sequences is, as the following communication analysis shows, its much smaller total communication volume and the resulting superior overall scalability as the degree of sequence parallelism increases.
+
+On modern compute clusters equipped with NVSwitch for intra-node communication and a fat-tree IB topology for inter-node communication, when an all-to-all is performed across *P* GPUs with an aggregate message size of *M*, the communication volume per link is *M/P*. For a transformer model with hidden size *h*, sequence length *N*, and parallelism degree *P*, an all-to-all is performed on the QKV before the attention computation with an aggregate message size of *3Nh*, and another all-to-all is performed on the attention output with a message size of *Nh*. Therefore, the total communication volume per link per transformer layer is ***4Nh/P*** (O(N/P) in order terms). This communication volume is constant when both N and P are increased proportionally.
+
+In contrast, existing approaches such as Megatron-LM's sequence parallelism incur communication that grows linearly with *N* regardless of *P*, so the order of the communication volume is ***O(N)***. For example, Megatron-LM performs, for each transformer layer, two all-gathers with communication volume Nh and two reduce-scatters with communication volume Nh. However, the cost of each all-gather and reduce-scatter of size M remains M (rather than M/P) when *P \>\> 1*. Therefore, Megatron-LM sequence parallelism incurs a communication volume of ***4Nh***, P times larger than that of DeepSpeed-Ulysses. This allows DeepSpeed-Ulysses to train with extremely long sequences while greatly improving training efficiency compared to existing approaches. The evaluation results below are consistent with this analysis.
+
+### Other Features
+
+***Independence from the Attention Implementation***
+
+DeepSpeed-Ulysses has a generalized structure that can be combined with any attention implementation: different algorithms such as self-attention, cross-attention, and dense/sparse attention, as well as implementations using various optimized kernels that support long sequences, such as FlashAttention.
+
+This generality is achieved by using the attention computation as a module. Before the attention computation, the sequence length N is partitioned into N/P, while the attention computation itself computes full attention per head, just with fewer heads per device. Therefore, the attention computation can be replaced with any kind of attention mechanism, such as dense algorithms or algorithms for various kinds of sparse attention.
+
+***Training Large Models with ZeRO-3***
+
+While the sequence partitioning and parallelization of DeepSpeed-Ulysses reduce activation memory when training with long sequences, they do not affect the amount of memory required to hold the model states. Therefore, to support training with long sequence lengths on large language models, sequence parallelism is integrated with ZeRO-3.
+
+
+[ZeRO Redundancy Optimizer Stage 3 (ZeRO-3)](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) is a memory optimization technique for training large models. Unlike conventional data parallelism, which replicates the model states (parameters, gradients, and optimizer states) on all GPUs, ZeRO-3 partitions the model states across the GPUs. When sequence parallelism is used together, the training data is partitioned along both the sample dimension and the sequence dimension.
+We therefore perform ZeRO-3's partitioning of parameters, gradients, and other states across the group of processes spanning both data parallelism and sequence parallelism, and collect them via allgather communication when needed. Similarly, gradient aggregation (reduce) for the parameter update is carried out across the processes spanning both data and sequence parallelism. Using ZeRO enables large memory savings in both the sequence and data dimensions, scaling not only to long sequence lengths but also to large model sizes.
+
+## Evaluation
+
+We applied DeepSpeed-Ulysses to the training of GPT models, which are used as foundation models for many NLP tasks, and evaluated it using up to 64 A100 GPUs (40GB memory). The evaluation was carried out from four perspectives: i) sequence length scalability, ii) throughput with dense attention and comparison with existing systems, iii) throughput with sparse attention and comparison with existing systems, and iv) verification of convergence. The results for each are shown below.
+
+### Sequence Length Scalability
+
+
+The first evaluation experiment is strong scaling of the sequence length up to 1 million tokens on a 1.2 billion parameter GPT model. The results of this evaluation are shown in Figure 2. When the sequence length is increased in proportion to the number of GPUs, nearly the same computation throughput is maintained at each GPU count and sequence length.
+
+
+
+
+*Figure 2: Strong scaling at different sequence lengths and GPU counts*
+
+
+### Comparison with Dense Attention
+
+Next, Figure 3 shows benchmark results on a 30 billion parameter model with dense attention, compared against Megatron-LM's sequence parallelism on 64 A100 GPUs.
+
+Here, we compared DeepSpeed-Ulysses and Megatron-LM sequence parallelism at various sequence lengths. For the evaluation, we selected the sequence parallelism degree and global batch size that give each framework its best performance (measured in throughput or TFLOPs); we call these the optimal (batch size, sequence length) configurations. For DeepSpeed-Ulysses, we always used ZeRO-3, partitioning the parameters, gradients, and optimizer states across the 64 GPUs.
+
+As shown in Figure 3, DeepSpeed-Ulysses consistently outperformed Megatron-LM at the sequence lengths that both can process. Furthermore, DeepSpeed-Ulysses can process longer sequences than Megatron-LM's sequence parallelism. The advantages of DeepSpeed-Ulysses are twofold: (1) in combination with ZeRO-3, it needs less memory, so it can process larger batch sizes than Megatron-LM, increasing throughput; (2) DeepSpeed-Ulysses benefits from more efficient all-to-all communication compared with the all-gather communication applied in Megatron-LM sequence parallelism.
+
+
+
+
+
+*Figure 3: Comparison with Megatron-LM on a 30 billion parameter model with dense attention*
+
+
+### Comparison with Sparse Attention
+
+Similarly, we applied DeepSpeed-Ulysses to a 30 billion parameter model using sparse attention and compared it with Megatron-LM's sequence parallelism. The results of the evaluation are shown in Figure 4. For sparse attention, we see trends similar to dense attention. We confirmed that the throughput of DeepSpeed-Ulysses is more than 2x that of Megatron-LM. Through the reduction in memory usage with ZeRO-3, it can process sequence lengths 4x longer than Megatron-LM.
+
+DeepSpeed-Ulysses outperforms Megatron-LM at the sequence lengths that both can run. In fact, the current throughput of DeepSpeed-Ulysses is bottlenecked by the sparse attention computed locally on each GPU, and as a result the throughput decreases as the sequence length increases. We expect that improving the performance of the local sparse attention implementation in the future will further widen the performance gap between DeepSpeed-Ulysses and Megatron at larger sequence lengths.
+
+
+
+
+*Figure 4: Comparison with Megatron-LM on a 30 billion parameter model with sparse attention*
+
+
+### Convergence Verification
+
+Figure 5 shows the convergence of a 1.3 billion parameter GPT model with a 32K sequence length on 8 A100 GPUs. Here, the sequence parallelism degree is set to 4 for both DeepSpeed-Ulysses and Megatron-LM sequence parallelism. Since DeepSpeed-Ulysses can be used together with ZeRO, we evaluated convergence at each ZeRO stage. DeepSpeed-Ulysses is a system-level optimization technique for enabling the training of long-sequence transformer models, and therefore has no (negative) impact on the quality of the trained models; this can be confirmed from the results in Figure 5.
+
+
+
+
+*Figure 5: Convergence at different ZeRO stages*
+
+
+## Using DeepSpeed-Ulysses
+
+DeepSpeed-Ulysses can be easily integrated into existing code with just a few simple code changes. Here is an example of its use:
+
+```python
+from deepspeed.sequence.layer import DistributedAttention
+
+# Replace the original self-attention (attn) with DeepSpeed-Ulysses’s self-attention
+dist_attn = DistributedAttention(attn, get_sequence_parallel_group())
+```
+
+Compared with other libraries that support sequence parallelism, such as Megatron-LM, DeepSpeed-Ulysses does not require model refactoring.
+DeepSpeed-Ulysses is also integrated and tested with the Megatron-DeepSpeed code repository.
+If you are already using that repository to train large language models, you can immediately leverage DeepSpeed-Ulysses to train models with huge sequence lengths.
+
+## Try It Out Today!
+
+DeepSpeed-Ulysses is accessible through the DeepSpeed GitHub. A detailed tutorial on its usage is available on the [DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/ds-sequence/).
+
+We welcome broad cooperation and collaboration to explore, together with our users, what becomes possible by removing the constraints on handling long contexts. DeepSpeed-Ulysses is part of the bigger DeepSpeed ecosystem for large-scale AI training and inference. For details on DeepSpeed's many technologies and innovative features, please visit our [website](https://www.deepspeed.ai/) or follow us on X (formerly Twitter; [English](https://twitter.com/MSFTDeepSpeed), [Japanese](https://twitter.com/MSFTDeepSpeedJP)) and on China's [Zhihu](https://www.zhihu.com/people/deepspeed).
+
+DeepSpeed welcomes your participation in its development. You can report bugs, submit pull requests, and join discussions on the DeepSpeed GitHub page; see the [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for details. We also collaborate with universities, research labs, and companies. For such collaboration requests (and other topics not suited for GitHub), please email us directly at deepspeed-info@microsoft.com.
diff --git a/blogs/deepspeed-ulysses/media/convg.png b/blogs/deepspeed-ulysses/media/convg.png
new file mode 100644
index 000000000000..b9586dc404e4
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/convg.png differ
diff --git a/blogs/deepspeed-ulysses/media/convgZ.png b/blogs/deepspeed-ulysses/media/convgZ.png
new file mode 100644
index 000000000000..324f47cd61bd
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/convgZ.png differ
diff --git a/blogs/deepspeed-ulysses/media/dense1B1Mscale.png b/blogs/deepspeed-ulysses/media/dense1B1Mscale.png
new file mode 100644
index 000000000000..eb886f879247
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/dense1B1Mscale.png differ
diff --git a/blogs/deepspeed-ulysses/media/dense30B.png b/blogs/deepspeed-ulysses/media/dense30B.png
new file mode 100644
index 000000000000..d2eef04b73cc
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/dense30B.png differ
diff --git a/blogs/deepspeed-ulysses/media/dense7B.png b/blogs/deepspeed-ulysses/media/dense7B.png
new file mode 100644
index 000000000000..042269276a6b
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/dense7B.png differ
diff --git a/blogs/deepspeed-ulysses/media/fig2Ulysses.png b/blogs/deepspeed-ulysses/media/fig2Ulysses.png
new file mode 100644
index 000000000000..39e8a8420bde
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/fig2Ulysses.png differ
diff --git a/blogs/deepspeed-ulysses/media/fig3Ulysses.png b/blogs/deepspeed-ulysses/media/fig3Ulysses.png
new file mode 100644
index 000000000000..fa1498096284
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/fig3Ulysses.png differ
diff --git a/blogs/deepspeed-ulysses/media/fig4Ulysses.png b/blogs/deepspeed-ulysses/media/fig4Ulysses.png
new file mode 100644
index 000000000000..f55838b36e78
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/fig4Ulysses.png differ
diff --git a/blogs/deepspeed-ulysses/media/hero1.png b/blogs/deepspeed-ulysses/media/hero1.png
new file mode 100644
index 000000000000..f0034ffdf8b9
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/hero1.png differ
diff --git a/blogs/deepspeed-ulysses/media/hero2.png b/blogs/deepspeed-ulysses/media/hero2.png
new file mode 100644
index 000000000000..323d3d419002
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/hero2.png differ
diff --git a/blogs/deepspeed-ulysses/media/image3.png b/blogs/deepspeed-ulysses/media/image3.png
new file mode 100644
index 000000000000..18be3c843370
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/image3.png differ
diff --git a/blogs/deepspeed-ulysses/media/sparse30B.png b/blogs/deepspeed-ulysses/media/sparse30B.png
new file mode 100644
index 000000000000..2637d353d0c6
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/sparse30B.png differ
diff --git a/blogs/deepspeed-ulysses/media/sparse7B.png b/blogs/deepspeed-ulysses/media/sparse7B.png
new file mode 100644
index 000000000000..2d9c9ad69420
Binary files /dev/null and b/blogs/deepspeed-ulysses/media/sparse7B.png differ
diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md
new file mode 100644
index 000000000000..e91ff1ecd51e
--- /dev/null
+++ b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md
@@ -0,0 +1,181 @@
+
+
+
+# DeepSpeed-VisualChat: Multi-Round Image + Text Inputs for a New Kind of AI Chat Experience
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed-VisualChat, please cite our [arxiv report](https://arxiv.org/abs/2309.14327):
+
+
+```
+@article{yao2023deepspeed-visualchat,
+ title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}},
+ author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14327},
+ year={2023}
+}
+```
+
+# 1. Overview
+Large language models (LLMs) such as GPT and LLaMa have demonstrated exceptional capabilities in a variety of text generation and understanding tasks, especially after zero-/few-shot learning or instructed fine-tuning. However, a key capability that must be added to prepare AI models for diverse tasks is multimodality; for example, an AI model should be able to read images, hear sounds, watch videos, and so on. This capability is largely absent from purely text-based LLMs.
+
+Recently, a large number of research projects have begun to explore introducing visual capabilities into LLMs, in particular enabling LLMs to understand images by feeding in image inputs (referred to as large vision-language models, or LVLMs for short).
+
+The main drawbacks of most existing work are:
+* The focus is mainly on tasks related to a single image, such as visual question answering and captioning, or on handling multiple images that are input simultaneously. Neither approach is good at managing interleaved image and text inputs.
+* System scalability is limited to models with about 10B parameters, which is an order of magnitude smaller than the largest open-source models.
+
+However, for a true AI chat model, the input may be multiple images interleaved with text, a situation rarely addressed by current work. In addition, the generative capability of LLMs grows rapidly as the model size increases. Therefore, focusing system capabilities on ~10B models limits further exploration of the potential of LVLMs.
+
+To address these issues, we introduce DeepSpeed-VisualChat (see the [arxiv report](https://arxiv.org/abs/2309.14327) for more details) with the following new features:
+
+* ***Fully open-source multi-round multi-image framework with unprecedented scalability***: DeepSpeed-VisualChat, one of the pioneering fully open-source frameworks, supports multi-round and multi-image dialogues, accommodating interleaved text and image inputs. We leverage DeepSpeed to improve our training, using a 2B visual encoder and a 70B LLaMA-2 decoder model, demonstrating the remarkable scalability of our framework.
+* ***Multi-Modal Causal Attention (MMCA)***: We design a new MMCA attention mechanism for multimodal models that computes attention weights independently across the different modalities. MMCA achieves a goal similar to the conventional cross-attention mechanism, but offers an enhanced causal attention interpretation for generative tasks, eliminating the need for extra modules or parameters. It also provides better training data efficiency than standard causal attention.
+* ***Data blending for interleaved inputs***: To facilitate conversations with interleaved modalities, DeepSpeed-VisualChat employs various data blending techniques on existing datasets, overcoming the shortage of interleaved text and image inputs in most existing open-source datasets.
+
+# 2. Model Architecture Overview
+
+
+
+ *Figure 1: Model architecture illustration of DeepSpeed-VisualChat.*
+
+
+As shown in *Figure 1*, the model architecture of DeepSpeed-VisualChat consists of three components: a visual encoder such as CLIP; a language decoder such as LLaMa-7B; and a feature alignment linear projection layer. Most of the model is frozen; only the language model's embedding and the linear projection layer are trainable. The total number of trainable parameters therefore ranges roughly from O(10M) (LLaMa-2-13B) to O(100M) (LLaMa-2-70B).
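+
+For intuition, here is a minimal sketch of the trainable feature-alignment projection; the class name and dimensions are illustrative assumptions, not the released configuration:
+
+```python
+import torch.nn as nn
+
+class FeatureAlignment(nn.Module):
+    # Hypothetical sketch: a single trainable linear layer that maps frozen
+    # vision-encoder features into the language decoder's hidden space.
+    def __init__(self, vision_dim: int = 1664, lm_hidden: int = 4096):
+        super().__init__()
+        self.proj = nn.Linear(vision_dim, lm_hidden)
+
+    def forward(self, vision_feats):  # [batch, num_image_tokens, vision_dim]
+        return self.proj(vision_feats)  # -> [batch, num_image_tokens, lm_hidden]
+```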
+
+# 3. DeepSpeed Multi-Modal Causal Attention
+
+Two common attention mechanisms used to connect the visual and textual components of a multimodal model are causal attention, as used in MiniGPT and QWen-VL, and cross-attention, as used in Otter and Flamingo.
+
+
+
+
+ *Figure 2: Different attention mechanisms: we compare different attention mechanisms using an input sentence "User: Please describe the image." and three image tokens (I-token1, I-token2, I-token3). On the left, we show standard causal attention, which treats image tokens as text. In the middle, we show cross-attention applied to the images while keeping standard causal attention for the text tokens. On the right, we show our novel MMCA attention mechanism, where image tokens only perform self-attention and text tokens attend to text/image tokens independently, with the image part in orange. This mechanism is defined by: softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$), where Q and K are the query and key, $`M_1`$=[M==1] and $`M_2`$=[M==2], with M $`\in`$ R^{10x10}.*
+
+
+Causal Attention (CA): CA-based methods simply project the visual features (i.e., the features output by the final visual encoder layer) into textual features and combine them with the normal text features after the text embedding layer as input to the LLM. The benefit of CA is that it is a natural extension of the LLM's original attention mechanism and therefore introduces no extra modules or parameters. However, intuition suggests this approach has some problems:
+
+* Each visual token attends to the visual and text tokens before it. However, the visual tokens are already fully encoded in a bidirectional manner and do not need to further attend to the visual and text tokens preceding them.
+* For a text token, the model needs to learn how to distribute its attention weights between the text and image tokens preceding it. Because of these issues, we found that the data efficiency of CA in LVLMs is often problematic. To address this, LLaVA and QWen-VL require vision-language pretraining to fully align the visual features with the textual features.
+
+Cross-Attention (CrA): As an alternative, cross-attention (CrA) combined with CA shows better data efficiency, but it also comes with drawbacks:
+
+* It introduces new parameters into the model. For example, Otter, with the new parameters introduced by cross-attention, has more than 1.5 billion trainable parameters. Compared with LLaVA's millions of trainable parameters, this greatly increases training cost and memory requirements.
+* Careful design is required if an image is introduced in the middle of training, since preceding text tokens should not be able to attend to the image.
+
+Multi-Modal Causal Attention (MMCA): To overcome these issues, we propose a new multi-modal causal attention mechanism (MMCA) that has both the parameter efficiency of CA and the data efficiency of CrA. The overall idea is as follows:
+
+* Visual tokens attend only to themselves, since they are encoded by the visual encoder.
+* Text tokens attend to all previous tokens; however, MMCA uses two separate attention weight matrices for the preceding text tokens and image tokens.
+
+The intuition behind the second point of MMCA is that the attention weights of one modality may affect those of the other. For example, a text token may attend more to textual information than to visual information. Hence, if the attention weight matrix were normalized across both modalities, the attention scores of the visual tokens could become very small. Please refer to *Figure 2* for a visualization of the three attention mechanisms.
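+
+To make the masking concrete, here is a minimal PyTorch sketch of the Figure 2 formula, softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$). Following the usual convention, masked entries are set to -inf before each softmax rather than multiplied by zero; the shapes and function name are illustrative:
+
+```python
+import torch
+
+def mmca(Q, K, V, M):
+    # Q, K, V: [T, d]; M: [T, T] integer mask with entries in {0, 1, 2}.
+    att = Q @ K.transpose(-1, -2) / K.shape[-1] ** 0.5
+
+    def masked_softmax(scores, mask):
+        scores = scores.masked_fill(~mask, float("-inf"))
+        # Rows with no valid keys become all -inf; replace their NaNs with 0.
+        return scores.softmax(dim=-1).nan_to_num(0.0)
+
+    weights = masked_softmax(att, M == 1) + masked_softmax(att, M == 2)
+    return weights @ V
+```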
+
+Demo results. We first show, through several examples, DeepSpeed-VisualChat's single-image vision-language conversation capabilities under different attention mechanisms. In these experiments, we use the LLaMA2-7B language model and the QWen-VL visual encoder as our visual encoder. The two models are connected by a simple linear projection layer. The model was trained on two LLaVa datasets. As shown in *Figure 3* and *Figure 4*, when paired with MMCA, DeepSpeed-VisualChat effectively identifies the visual details in the images and gives accurate and fluent answers to the user's questions.
+In addition, compared with other attention mechanisms, such as the combination of causal attention with cross-attention, MMCA exhibits a more comprehensive and precise grasp of image details. Compared with the combination of CrA and CA, as well as MMCA, using CA alone may show slightly more errors (*Figure 3*) or lead to weaker comprehension (*Figure 4*).
+
+
+
+
+ *Figure 3: Example visual and language inputs, showing the output comparison between (1) standard causal attention (CA), (2) standard causal attention combined with cross-attention (CA + CrA), and (3) the special multi-modal causal attention (MMCA) in DeepSpeed-VisualChat.*
+
+
+
+
+
+ *Figure 4: DeepSpeed-VisualChat accurately identifies the scene as a beautiful lake and provides a reasonable set of suggestions. In contrast, the other attention mechanisms misinterpret the image as containing "a dock with a boat ramp".*
+
+
+# 4. Data Blending
+We used 9 datasets from 3 sources, as described in our [arxiv report](https://arxiv.org/abs/2309.14327). A critical missing element for enabling multi-round, multi-image conversations is adequate data. The only multi-round multi-image data source we found is the SparklesDialogue dataset, which contains a mere 6,520 samples. To address this, we employed two methods to synthesize multi-round multi-image data from existing single-image or single-round data: simple data concatenation and LLaVA-Otter blending.
+
+## 4.1 Simple Data Concatenation
+For the "llava" and "llava_dial" datasets used by the LLaVA model, each sample consists of single-/multi-round conversations about a single image. To simulate a scenario where a user asks about multiple images in sequence, we applied simple post-processing to these two datasets. Specifically, we randomly concatenated a varying number of samples into one sample: for "llava" we concatenated 1 to 3 samples, and for "llava_dial" we concatenated 1 to 2 samples, as sketched below.
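+
+As a rough illustration of this post-processing (the field names and sample structure below are hypothetical, not the datasets' actual schema):
+
+```python
+import random
+
+def concat_samples(dataset, max_k=3):
+    # Randomly merge 1..max_k single-image samples into one
+    # multi-image, multi-round sample.
+    dataset = dataset[:]
+    random.shuffle(dataset)
+    merged, i = [], 0
+    while i < len(dataset):
+        k = random.randint(1, max_k)
+        group = dataset[i:i + k]
+        merged.append({
+            "images": [s["image"] for s in group],
+            "conversations": [t for s in group for t in s["conversations"]],
+        })
+        i += k
+    return merged
+```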
+
+## 4.2 LLaVA-Otter Blending
+We noticed that the llava and llava_dial datasets used by the LLaVA model and the otter_mimicit_cgd dataset used by the Otter model all use images from COCO train2017. For llava and llava_dial, each sample consists of single-/multi-round conversations about one image. For otter_mimicit_cgd, each sample consists of a single-round conversation about a pair of images. This allows us to construct a synthetic multi-round multi-image dataset, llava_otter_blend, as a more natural blend: for each sample in otter_mimicit_cgd, we look for llava and llava_dial samples that use the same images, and build a new sample in the order "llava/llava_dial conversations, then the otter_mimicit_cgd conversation".
+
+
+
+
+ *Figure 5: A data sample after LLaVA-Otter blending. The gray dialog boxes are from the LLaVA datasets, and the orange dialog boxes are from the Otter dataset.*
+
+
+# 5. Demo
+We trained our DeepSpeed-VisualChat-13B model, which uses a 2B visual encoder and a 13B LLaMA model, on several open-source datasets. DeepSpeed-VisualChat-13B demonstrates image captioning (*Figures 6--8*), counting and text reading (*Figure 6*), celebrity recognition (*Figure 7*), storytelling (*Figure 8*), and more.
+
+
+
+
+ *Figure 6: DeepSpeed-VisualChat can count the number of people in an image and read the text in the first image. It also shows cross-image understanding.*
+
+
+
+
+
+ *Figure 7: DeepSpeed-VisualChat can recognize celebrities and connect them with their achievements.*
+
+
+
+
+
+ *Figure 8: DeepSpeed-VisualChat can tell stories and recognize movies.*
+
+
+# 6. How to Get Started with DeepSpeed-VisualChat
+DeepSpeed-VisualChat is an easy-to-use training framework with great scalability, tested so far on LLaMa-2-70B models. We adopt a unified instruction-tuning format for all experiments, with the template shown below.
+```
+ % You are a powerful vision-language assistant.
+
+### Image 1: % some image, e.g., cat-1.png
+### Question: % please describe the image.
+### Answer: % It's a cute black cat.
+
+### Image 2: % some image, e.g., cat-2.png
+### Image 3: % some image, e.g., cat-3.png
+### Question: % What's the difference between the three cats?
+### Answer: % The colors of the three cats are different.
+...
+```
+
+Training a model with DeepSpeed-VisualChat is simple and convenient. Here we give an example based on the CLIP visual encoder and the LLaMa-7B model:
+
+```
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/
+pip install -r requirements.txt
+cd training
+bash training_scripts/run_7b.sh
+```
+
+The trained model checkpoints will be automatically saved in a Hugging Face-compatible format and can be used to launch your own visual chat API:
+```
+cd ../chat
+bash chat_scripts/run.sh # You need to change the necessary variables, e.g., the ckpt path
+```
+
+To support larger model inference, we have integrated Hugging Face large model inference into our DeepSpeed-VisualChat API. As a result, users can choose a different number of GPUs based on GPU memory capacity and model size.
+
+Please refer to our [GitHub landing page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details.
+
+# 7. 发布:今天尝试 DeepSpeed-VisualChat!
+
+我们非常兴奋地分享 DeepSpeed-VisualChat 现已开源并供 AI 社区使用。
+
+* 要开始使用,请访问我们的 DeepSpeed-VisualChat GitHub 页面:[GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)
+
+* 我们将继续在您的反馈和支持下改进 DeepSpeed-VisualChat。我们的 [路线图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) 显示了目前支持的功能以及未来计划支持的功能。
+
+DeepSpeed-VisualChat 是更大的 DeepSpeed 生态系统的一部分,其中包括一系列深度学习系统和建模技术。要了解更多信息,
+
+* 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。
+* 在我们的 [英文 X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[日语 X(Twitter)](https://twitter.com/MSFTDeepSpeedJP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。
+
+我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。
+
+* 如果你喜欢我们的工作,请在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 和 [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) 上为我们的仓库点“星”。
diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md
new file mode 100755
index 000000000000..ec23e005bff6
--- /dev/null
+++ b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md
@@ -0,0 +1,188 @@
+
+
+# DeepSpeed-VisualChat: 複数ラウンド・複数画像の入力が可能なAIチャット体験を実現
+
+
+
+
+
+
+
+
+DeepSpeed-VisualChatを引用する場合、[arxiv上のレポート](https://arxiv.org/abs/2309.14327)を引用してください。
+
+```
+@article{yao2023deepspeed-visualchat,
+ title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}},
+ author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14327},
+ year={2023}
+}
+```
+
+# 1. 概要
+GPTやLLaMaのような大規模言語モデル(LLM)は、テキスト生成やテキスト理解などの多くのタスクにおいて、Zero-shot/Few-shot学習、あるいはinstructed fine-tuningによって、非常に優れた能力を示してきました。しかし、AIエージェントをより多様なタスクに対応させるには、マルチモーダルを扱う能力が必要です。例えば、AIエージェントは画像を読んだり、音声を聞いたり、ビデオを見たりすることができる必要があります。こうした機能は、テキストベースのLLMにはほとんどありません。
+
+近年、LLMに視覚的な能力を導入することは、研究・実践の両方において広く試みられています。特に、画像をそのまま与えて、LLMが理解できるようにする取り組みが行われています(大規模視覚言語モデル、略してLVLMなどと呼ばれる)。
+
+こうした分野における、既存の研究の主な問題は以下の通りです:
+
+* 視覚に関する質問への回答やキャプション付けのように、単一の画像に関連するタスクや、同時に入力される複数の画像の処理に重点が置かれており、画像とテキストが交互に入力されるような状況には対応していない
+* システムのスケーラビリティは、~10Bのパラメータを持つモデルに限定される
+
+しかし、本来はAIチャットエージェントには、複数のテキストと画像の両方が与えられる可能性があります。また、LLMの生成能力は、モデルサイズが大きくなるにつれて急速に向上することが知られており、~10Bのモデルではその能力が制限されてしまいます。
+
+これらの問題を解決するために、我々は以下の新たな機能を備えたDeepSpeed-VisualChat(詳細は[arxivのレポート](https://arxiv.org/abs/2309.14327)を参照)を開発しました:
+
+* ***完全にオープンソース化され、前例のないスケーラビリティを備えた複数ラウンド・複数画像を処理できるフレームワーク***: DeepSpeed-VisualChatは、完全にオープンソース化された先進的なフレームワークの1つであり、複数ラウンドを通じて画像とテキストが両方与えられる対話を可能にします。また、DeepSpeedを利用することで、比類ないスケーラビリティを実現しており、実際に2Bのビジュアルエンコーダーと70BのLLaMA-2デコーダーモデルで訓練を行えます。
+* ***マルチモーダル因果的注意(MMCA)***: マルチモーダルモデルのための新しいアテンションMMCA(Multi-Modal Causal Attention)を考案し、異なるモダリティ間で独立にアテンションの重みを計算します。MMCAは、従来のcross attentionに類似したものですが、生成タスクのためのcausal attentionを強化しており、追加のモジュールやパラメータが不要になります。また、標準的なcausal attentionと比較して、優れた訓練データ効率を示します。
+* ***順次与えられる画像とテキストを扱うためのデータブレンディング***: DeepSpeed-VisualChatは、既存のデータセットに様々なデータブレンディング技術を採用しています。これにより、順次与えられるテキストと画像の不足という、利用可能なオープンソースデータセットのほとんどに当てはまる課題を克服しています。
+
+# 2 モデルアーキテクチャの概要
+
+
+
+ *図1: モデルアーキテクチャの概要*
+
+
+
+*図1*に示すように、DeepSpeed-VisualChatのモデルアーキテクチャは、CLIPのような視覚エンコーダー、LLaMa-7Bのような言語デコーダー、特徴アライメントを行う linear projectionレイヤの3つのコンポーネントで構成されています。モデルのほとんどのパラメータは固定されており、言語モデルのembeddingとlinear projectionレイヤのみが学習可能です。その結果、学習可能なパラメータの総数は O(10M) (LLaMa-2-13B) から O(100M) (LLaMa-2-70B) となります。
+
+# 3. DeepSpeed マルチモーダル Causal Attention (MMCA)
+
+マルチモーダルモデルで、画像とテキストをつなぐ一般的なattentionの機構は二つあります。一つはMiniGPTやQWen-VLで使われているようなcausal attentionで、もう一つはOtterやFlamingoで使われているようなcross attentionです。
+
+
+
+
+
+ *図2: 異なるアテンションの機構: 「ユーザー:画像を説明してください」という入力文と3つの画像トークン(I-token1、I-token2、I-token3)を組み合わせて与えた場合の、それぞれのattention機構の構成を示しています。左側では、標準的なcausal attentionによって、画像トークンをテキストとして扱う様子を示しています。中央は、テキストトークンに対する標準的なcausal attentionを維持しながら、画像に適用されるcross attentionを使用する様子を示しています。右側では、画像トークンはself attentionのみを行い、テキストトークンはテキスト/画像トークンへのアテンションを独立に計算するという、新しいマルチモーダルのためのアテンションの提案を、オレンジ色のマスクで強調して示しています。この仕組みは、Q, Kをクエリとキーとしたとき、softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$) として定義されます。ここで $`M \in R^{10\times10}`$、$`M_1`$=[M==1]、$`M_2`$=[M==2] です。*
+
+
+Causal Attention(CA):CAに基づく方法は、視覚的特徴(最終的な視覚エンコーダ層の出力からの特徴)を単純にテキストの特徴量に投影し、テキスト埋め込み層以降の通常のテキストの特徴量と組み合わせてLLMに送り込むというものです。CAの利点は、LLMにおける本来のアテンション機構の自然な拡張であり、そのため余分なモジュールやパラメータを導入しないことです。しかし、このアプローチにはいくつかの直感的な問題があります:
+
+* 視覚トークンはすでに双方向に特徴量に変換されており、本来は他の視覚トークンやテキストトークンとのアテンションは必要ありませんが、実際には前の視覚トークンまたはテキストトークンとのアテンションを計算してしまいます。
+* テキストトークンの場合、モデルは前のテキストトークンと画像トークンとの間でどのようにアテンションの重みを配分するかを学習する必要があります。これらの問題により、LVLMにおけるCAのデータ効率にはしばしば問題があることが分かりました。この問題への対処として、LLaVAとQWen-VLは、視覚的特徴とテキストの特徴を完全に対応させるために、視覚言語の事前学習を必要とします。
+
+Cross Attention (CrA):代替案であるCross Attention (CrA) と CAの組み合わせは、より優れたデータ効率を示しますが、いくつかの欠点もあります:
+
+* モデルに新しいパラメーターを導入する必要があります。例えば、Otterは、Cross Attentionによって導入された新しいパラメータがあるため、LLaVAが数百万個の学習可能なパラメータを持つのに対し、15億個以上のパラメータを必要とします。これにより、学習コストと必要メモリ量が大幅に増加します。
+* 訓練中に会話の途中で画像が与えられた場合、前のテキストトークンは与えられた画像とのアテンションを求められないので、慎重な設計が必要です。
+
+マルチモーダル Causal Attention (MMCA):これらの問題を解決するために、我々は新しいマルチモーダルCausal Attention (MMCA) を提案します。この機構は、CAと同様のパラメータ効率と、CrAと同様のデータ効率の、両方の利点を持つものです。全体的なアイデアは以下の通りです:
+
+* 視覚トークンは視覚エンコーダによってエンコードされるため、視覚トークンは自分自身とのアテンションのみを利用する。
+* テキストトークンについては、その前のすべてのトークンに注目する。ただし、前のテキストトークンと画像トークンに対して、それぞれ別々のアテンションの重み行列を持つ。
+
+MMCAの2つ目のポイントは、1つのモダリティに対するアテンションの重みが、もう1つのモダリティに影響を与える可能性があるということです。例えば、テキストトークンは、視覚情報よりもテキスト情報により大きなアテンションを持つかもしれません。そのため、アテンションの重み行列を両方のモダリティで正規化すると、視覚トークンのアテンションスコアが非常に小さくなる可能性があります。3つのアテンション機構の視覚化については、*図2*を参照してください。
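+
+参考として、上記の2つの独立したアテンション重み行列の構成を示す、PyTorchによる簡単なスケッチを以下に示します。*図2* の softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$) に対応しますが、`is_image` マスクや関数名は説明用の仮のものであり、DeepSpeed-VisualChatの公式実装ではありません:
+
+```
+import torch
+import torch.nn.functional as F
+
+def mmca_weights(scores, is_image):
+    # scores: (L, L) のアテンションlogits(Q @ K^T / sqrt(d));is_image: (L,) のブール値
+    L = scores.size(0)
+    i, j = torch.meshgrid(torch.arange(L), torch.arange(L), indexing="ij")
+    causal = j <= i
+    m1 = causal & (is_image[i] == is_image[j])  # 画像→画像、テキスト→前のテキスト
+    m2 = causal & ~is_image[i] & is_image[j]    # テキスト→前の画像
+    neg = torch.finfo(scores.dtype).min
+    w1 = F.softmax(scores.masked_fill(~m1, neg), dim=-1)
+    w2 = F.softmax(scores.masked_fill(~m2, neg), dim=-1)
+    # 行全体がマスクされたsoftmaxは一様分布になるため、明示的にゼロにする
+    w1 = w1 * m1.any(-1, keepdim=True)
+    w2 = w2 * m2.any(-1, keepdim=True)
+    return w1 + w2  # それぞれ独立に正規化された2つの重みの和
+```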
+
+出力例:まず、異なるアテンションの機構を採用した、画像を一つだけ用いた会話におけるDeepSpeed-VisualChatの能力を示す様々な例を紹介します。これらの実験では、言語モデルとしてLLaMA2-7Bを、視覚エンコーダとしてQWen-VLの視覚エンコーダを使用します。これら2つのモデルはlinear projection layerを介して接続されています。このモデルは2つのLLaVaデータセットで学習を行いました。*図3*と*図4*で実証されているように、DeepSpeed-VisualChatはMMCAと組み合わされることで、画像内の視覚的な詳細を効果的に識別し、ユーザーのクエリに対して首尾一貫した応答を提供します。さらに、MMCAは、Causal AttentionとCross Attentionの両方から合成されたマスクを使用するような、別のアテンション機構と比べて、より包括的で正確な画像詳細の把握が可能です。また、CrAとCAの組み合わせやMMCAとは対照的に、CA単独では若干エラーが多く(*図3*)、推論能力の程度が低い(*図4*)可能性があることも明らかです。
+
+
+
+
+ *図3: (1) 標準的なcausal attention (CA) (2) cross attentionと組み合わせた標準的なcausal attention (CA+CrA) (3)DeepSpeed-VisualChatの特別なマルチモーダルCausal Attention (MMCA) の出力比較を示す視覚入力と言語入力の例。*
+
+
+
+
+
+ *図4:DeepSpeed-VisualChatは、示された場面を美しい湖として正確に識別し、妥当な提案のセットを提示する。対照的に、ベースラインは画像を「ボート乗り場のあるドック」と誤認識している。*
+
+
+# 4. データブレンディング
+
+[arxivのレポート](https://arxiv.org/abs/2309.14327)に記載されているように、訓練には3つのソースから9つのデータセットを使用しました。複数ラウンド・複数画像の入力を可能にするために決定的に欠けている要素は、適切なデータがないことです。我々が見つけた複数ラウンド・複数画像の唯一のデータソースはSparklesDialogueデータセットで、そこにはわずか6520サンプルしか含まれていません。この制限に対処するため、既存の単一画像または単一ラウンドのデータから、複数ラウンド・複数画像のデータを合成するために、単純なデータ連結とLLaVA-Otterデータ混合という2つの方法を採用しました。
+
+## 4.1 単純なデータ連結
+LLaVAモデルで利用する "llava" と "llava_dial" データセットでは、各サンプルは1つの画像に対する単一/複数ラウンドの会話で構成されています。ユーザーが複数の画像について逐次質問するシナリオをシミュレートするため、これら2つのデータセットに対して、簡単なデータ後処理を行いました。具体的には、ランダムな数のサンプルを1つのサンプルとして連結しました。 "llava" の場合は1~3個のサンプルを連結し、"llava_dial" の場合は1~2個のサンプルを連結しました。
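+
+この後処理の考え方を示す簡単なスケッチを以下に示します(`images` や `conversations` といったフィールド名は説明用の仮定であり、実際のデータセットのフィールド名ではありません):
+
+```
+import random
+
+def concat_samples(dataset, max_k):
+    # "llava" では max_k=3、"llava_dial" では max_k=2 を使用
+    pool = list(dataset)
+    random.shuffle(pool)
+    merged = []
+    while pool:
+        k = random.randint(1, max_k)           # 1~max_k 個のサンプルをランダムに連結
+        group, pool = pool[:k], pool[k:]
+        merged.append({
+            "images": [img for s in group for img in s["images"]],
+            "conversations": [t for s in group for t in s["conversations"]],
+        })
+    return merged
+```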
+
+## 4.2 LLaVAとOtterのデータブレンディング
+
+LLaVAモデルで使用されているllavaとllava_dialデータセット、およびOtterモデルで使用されているotter_mimicit_cgdデータセットは、すべてCOCO train2017画像を使用しています。llavaデータセットとllava_dialデータセットには、各サンプルに1つの画像に対する単発/複数回の会話が含まれます。otter_mimicit_cgdデータセットでは、各サンプルは画像のペアに対する1ラウンドの会話を含んでいます。そこで、otter_mimicit_cgdデータセットの各サンプルについて、同じ画像を使うllavaとllava_dialのサンプルを探し、「llava/llava_dial会話 -> otter_mimicit_cgd会話」という流れで新しいサンプルを構築しました。
+
+
+
+
+ *図5: LLaVA-Otterデータブレンド後のデータサンプル。灰色のダイアログボックスはLLaVAデータセットから、オレンジ色のダイアログボックスはOtterデータセットからのもの*
+
+
+# 5. デモ
+いくつかのオープンソースデータセットで2Bビジュアルエンコーダーと13B LLaMAモデルを使い、DeepSpeed-VisualChat-13Bモデルを訓練しました。DeepSpeed-VisualChat-13Bは、画像キャプション機能(*図6-8*)、計数とテキスト読み取り(*図6*)、著名人の認識(*図7*)、ストーリーテリング(*図8*)などを示しています。
+
+
+
+
+ *図6: DeepSpeed-VisualChatは、画像内の人数を数え、最初の画像のテキストを読み取ることができます。また、複数画像を横断的に理解することも可能です。*
+
+
+
+
+
+
+ *図7: DeepSpeed-VisualChatは有名人を認識し、その人物の業績と関連付けることができます*
+
+
+
+
+
+
+ *図8: DeepSpeed-VisualChatは、ストーリーを作ったり、映画を認識したりできます。*
+
+
+
+# 6. DeepSpeed-VisualChatを使い始めるには
+DeepSpeed-VisualChatは使いやすく、かつ優れたスケーラビリティを持つ学習フレームワークで、これまでLLaMa-2-70Bモデルでテストされています。
+すべての実験で統一された命令チューニング形式を採用しており、そのテンプレートを以下に示します。
+
+```
+ % You are a powerful vision-language assistant.
+
+### Image 1: % some image, e.g., cat-1.png
+### Question: % please describe the image.
+### Answer: % It's a cute black cat.
+
+### Image 2: % some image, e.g., cat-2.png
+### Image 3: % some image, e.g., cat-3.png
+### Question: % What's the difference between the three cats?
+### Answer: % The colors of the three cats are different.
+...
+```
+
+DeepSpeed-VisualChatの訓練は簡単かつ便利に実行できます。ここではCLIPビジュアルエンコーダーとLLaMa-7Bモデルを使用する例を示します:
+
+```
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/
+pip install -r requirements.txt
+cd training
+bash training_scripts/run_7b.sh
+```
+
+訓練されたチェックポイントは自動的にHugging Faceと互換性のある形式で保存され、独自のビジュアルチャットAPIを提供するために使用できます:
+
+```
+cd ../chat
+bash chat_scripts/run.sh # You need to change necessary variables, e.g., ckpt path
+```
+
+より大規模なモデル推論をサポートするために、我々はHugging Faceの大規模モデル推論をDeepSpeed-VisualChat APIに組み込みました。そのため、ユーザーはGPUメモリ容量とモデルサイズに基づいて、異なるGPU数を選択することができます。
+
+詳細は[ランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)をご参照ください。
+
+# 7. 早速使ってみましょう!
+
+DeepSpeed-VisualChatがオープンソース化され、AIコミュニティで利用できるようになったことを大変嬉しく思います。
+
+* まずは、DeepSpeed-VisualChatのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)
+
+* DeepSpeed-VisualChatは、皆様からのフィードバックとサポートにより改良を続けていきます。私たちの[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-)は、現在サポートされている機能と将来的に計画している機能を示しています。
+
+DeepSpeed-VisualChatは、さまざまなDeep Learningシステムやモデリング技術を含む、より大きなDeepSpeedエコシステムの一部です。詳細については、以下をご覧ください。
+
+* 私たちの[ウェブサイト](https://www.deepspeed.ai/)で、詳細なブログ記事、チュートリアル、役立つドキュメントを提供しています。
+* DeepSpeedの最新ニュースは、[English X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP)、[Chinese Zhihu](https://www.zhihu.com/people/deepspeed)をフォローしてください。
+
+DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については、deepspeed-info@microsoft.com まで直接メールをお送りください。
+
+* 私たちの[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)および[DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/)リポジトリが気に入ったら、ぜひスターをつけてください!
diff --git a/blogs/deepspeed-visualchat/10-03-2023/README.md b/blogs/deepspeed-visualchat/10-03-2023/README.md
new file mode 100755
index 000000000000..eac9d06c3cea
--- /dev/null
+++ b/blogs/deepspeed-visualchat/10-03-2023/README.md
@@ -0,0 +1,188 @@
+
+
+# DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed-VisualChat, please cite our [arxiv report](https://arxiv.org/abs/2309.14327):
+
+```
+@article{yao2023deepspeed-visualchat,
+ title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}},
+ author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He},
+ journal={arXiv preprint arXiv:2309.14327},
+ year={2023}
+}
+```
+# 1. Overview
+Large language models (LLMs), such as GPT and LLaMa, have showcased exceptional prowess in a myriad of text generation and comprehension tasks, especially when subjected to zero-/few-shot learning or after instruction fine-tuning. However, to equip AI agents for diverse tasks, one critical feature that needs to be incorporated is multi-modal capability; for instance, the AI agent should be able to read images, hear voices, watch videos, etc. This capability is largely absent in solely text-based LLMs.
+
+Recently, one of the research/practice mainstreams has begun exploring the incorporation of visual capability into LLMs, especially enabling LLMs to understand images by inserting raw pictures (referred to as large visual language models, or LVLMs in short).
+
+The main caveats of the majority of existing works are:
+* The focus is predominantly on tasks related to a single image, such as visual question answering and captioning, or on handling multiple images that require concurrent input. Neither approach adeptly manages interleaved image-and-text input.
+* The scalability of the system is limited to models with ~10B parameters, which is about an order of magnitude smaller than the largest open-sourced models.
+
+However, for a genuine AI chat agent, the content of inputs could be multiple images interleaved with text, a situation rarely addressed by current works. Also, the generation capability of LLMs grows quickly as the model size increases. Therefore, focusing system capability on ~10B models limits further exploration of the potential of LVLMs.
+
+To resolve these issues, we are introducing DeepSpeed-VisualChat (see [arxiv report](https://arxiv.org/abs/2309.14327) for more details) with the following new features:
+
+* ***Fully Open-Sourced Multi-round Multi-image Framework with Unprecedented Scalability***: DeepSpeed-VisualChat, one of the pioneering fully open-sourced frameworks, enables multi-round and multi-image dialogues, accommodating interleaved text-and-image inputs. We leverage DeepSpeed to enhance our training with a 2B visual encoder and a 70B LLaMA-2 decoder model, illustrating the remarkable scalability of our framework.
+* ***Multi-Modal Causal Attention (MMCA)***: We devise a novel MMCA for multi-modal models that computes attention weights independently across various modalities. MMCA achieves objectives analogous to conventional cross-attention mechanisms but offers enhanced causal attention interpretations for generative tasks, eliminating the need for additional modules or parameters. It also presents superior training data efficiency compared to standard causal attention.
+* ***Data Blending for Interleaved Inputs***: To facilitate conversations with interleaved modalities, DeepSpeed-VisualChat employs assorted data blending techniques on existing datasets, overcoming the shortage of interleaved text-and-image inputs in most available open-source datasets.
+
+
+
+# 2 Model architecture overview
+
+
+
+ *Figure 1: Model architecture illustration.*
+
+
+
+The model architecture of DeepSpeed-VisualChat, as depicted in *Figure 1*, is composed of three components: a visual encoder, such as CLIP; a language decoder, such as LLaMa-7B; and a feature alignment linear projection layer. Most parts of the model are frozen, with only the embedding of the language model and the linear projection layer being trainable. Consequently, the total number of trainable parameters ranges from approximately O(10M) (LLaMa-2-13B) to O(100M) (LLaMa-2-70B).
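+
+As a rough sketch of this trainable-parameter layout (module and attribute names here are illustrative assumptions, not the actual DeepSpeed-VisualChat classes, and a Hugging Face-style `get_input_embeddings()` accessor is assumed for the language model):
+
+```
+import torch.nn as nn
+
+class VisualChatSketch(nn.Module):
+    def __init__(self, visual_encoder, language_model, vis_dim, txt_dim):
+        super().__init__()
+        self.visual_encoder = visual_encoder    # e.g., a CLIP-style encoder
+        self.language_model = language_model    # e.g., a LLaMa decoder
+        # The only newly introduced module: aligns visual features to text space.
+        self.projection = nn.Linear(vis_dim, txt_dim)
+        for p in self.parameters():             # freeze everything ...
+            p.requires_grad = False
+        for p in self.projection.parameters():  # ... except the projection layer
+            p.requires_grad = True
+        for p in self.language_model.get_input_embeddings().parameters():
+            p.requires_grad = True              # ... and the LLM token embeddings
+```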
+
+# 3. DeepSpeed multi-modal causal attention
+
+There are two common attention mechanisms used to connect the visual and textual components in a multi-modal model: causal attention, as used in MiniGPT and QWen-VL, and cross attention, as used in Otter and Flamingo.
+
+
+
+
+ *Figure 2: Different Attention Mechanisms: Examine the differing attention mechanisms using an input sentence "User: Please describe the image." coupled with three Image tokens (I-token1, I-token2, I-token3). On the left, we demonstrate standard causal attention, treating image tokens as text. In the middle, we present cross attention applied to images, while maintaining standard causal attention for text tokens. On the right, we illustrate our innovative multi-modal attention proposal where image tokens only perform self-attention, and text tokens attend to text/image tokens independently, highlighted with an orange mask. This mechanism is defined by: softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$), with Q and K as query and key, $`M_1`$=[M==1], $`M_2`$=[M==2], and $`M \in R^{10\times10}`$ in this case.*
+
+
+
+Causal Attention (CA): The CA-based method simply projects visual features (i.e., the features from the output of the final visual encoder layer) into textual features and combines them with the normal textual features after the textual embedding layer to feed into LLMs. The benefit of CA is that it's a natural extension of the original attention mechanism in LLMs, and as such, it doesn't introduce any extra modules or parameters. However, this approach raises some intuitive problems:
+
+* For a visual token, it attends to previous visual and textual tokens, even though visual tokens are already fully encoded in a bidirectional manner and do not need further attention to other visual tokens or previous textual tokens.
+* For a textual token, the model needs to learn how to distribute its attention weights between its previous textual and image tokens. Due to these issues, we found that the data efficiency of CA in LVLMs is often problematic. To address this, LLaVA and QWen-VL require visual-language pretraining to fully align visual features with textual features.
+
+Cross Attention (CrA): The alternative, cross attention (CrA), along with CA, exhibits better data efficiency but also comes with a few drawbacks:
+
+* It introduces new parameters to the model. For example, due to the new parameters introduced by cross attention, Otter has more than 1.5 billion trainable parameters, compared to the millions of trainable parameters in LLaVA. This significantly increases the training cost and memory requirements.
+* It requires careful design if an image is introduced in the middle of a conversation during training, as previous text tokens should not be able to attend to the image.
+
+Multi-Modal Causal Attention Mechanism (MMCA): To overcome these issues, we propose a new multi-modal causal attention mechanism (MMCA), which has both benefits, i.e., similar parameter efficiency as CA and similar data efficiency as CrA. The overall idea is as follows:
+
+* For visual tokens, they only attend to themselves, as visual tokens are encoded by the visual encoder.
+* For textual tokens, they attend to all their previous tokens. However, they have two separate attention weight matrices for their previous textual tokens and image tokens.
+
+The intuition behind the second point of MMCA is that the attention weight for one modality may affect the other modality. For instance, a textual token may pay more attention to textual information than visual information. Therefore, if the attention weight matrix is normalized across both modalities, the attention score for visual tokens might be very small. Refer to *Figure 2* for a visualization of the three attention mechanisms.
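+
+For illustration, here is a minimal PyTorch sketch of the two separate attention weight matrices, matching the softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$) formulation in *Figure 2*. The `is_image` mask and the function name are assumptions for exposition, not the actual implementation:
+
+```
+import torch
+import torch.nn.functional as F
+
+def mmca_weights(scores, is_image):
+    # scores: (L, L) attention logits (Q @ K^T / sqrt(d)); is_image: (L,) bool mask
+    L = scores.size(0)
+    i, j = torch.meshgrid(torch.arange(L), torch.arange(L), indexing="ij")
+    causal = j <= i
+    m1 = causal & (is_image[i] == is_image[j])  # image->image; text->previous text
+    m2 = causal & ~is_image[i] & is_image[j]    # text->previous image
+    neg = torch.finfo(scores.dtype).min
+    w1 = F.softmax(scores.masked_fill(~m1, neg), dim=-1)
+    w2 = F.softmax(scores.masked_fill(~m2, neg), dim=-1)
+    # A fully masked row degenerates to a uniform softmax; zero such rows out.
+    w1 = w1 * m1.any(-1, keepdim=True)
+    w2 = w2 * m2.any(-1, keepdim=True)
+    return w1 + w2  # sum of two independently normalized weight matrices
+```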
+
+
+Demo Results. We begin by showcasing various examples that highlight the capabilities of DeepSpeed-VisualChat in single-image visual language conversations, employing different attention mechanisms. In these experiments, we employ the LLaMA2-7B language model in conjunction with the QWen-VL visual encoder. These two models are connected via a straightforward linear projection layer. Our model underwent training on two LLaVa datasets. As demonstrated in *Figure 3* and *Figure 4*, DeepSpeed-VisualChat, when coupled with MMCA, effectively discerns visual details in images and furnishes coherent responses to user queries.
+Furthermore, DeepSpeed-VisualChat exhibits a more comprehensive and precise grasp of image details compared to alternative attention mechanisms, such as the use of combined masks from both causal attention and cross attention. It is also evident that, in contrast to the combination of CrA and CA, as well as MMCA, CA alone may exhibit slightly more errors (*Figure 3*) and a lower degree of reasoning capability (*Figure 4*).
+
+
+
+
+ *Figure 3: Example visual and language inputs that demonstrate the output comparison between (1) the standard causal attention (CA) (2) the standard causal attention combined with cross-attention (CA+ CrA) and (3) the special multi-modal causal attention (MMCA) in DeepSpeed-VisualChat.*
+
+
+
+
+
+
+ *Figure 4: DeepSpeed-VisualChat accurately identifies the scene as a beautiful lake and offers a set of plausible suggestions. In contrast, the baseline misinterprets the image as containing “dock with a boat ramp”.*
+
+
+
+# 4. Data blending
+We used 9 datasets from 3 sources as described in our [arxiv report](https://arxiv.org/abs/2309.14327). A critical obstacle to enabling multi-round and multi-image conversations is the absence of adequate data. The sole source of multi-round multi-image data we located is the SparklesDialogue dataset, which contains a mere 6520 samples. To address this limitation, we employed two methods to synthesize multi-round multi-image data from existing single-image or single-round data: simple data concatenation and LLaVA-Otter data blending.
+
+## 4.1 Simple data concatenation
+For the "llava" and "llava_dial" datasets utilized by the LLaVA model, each sample comprises single/multi-round conversations for a single image. To simulate scenarios where a user sequentially asks questions about multiple images, we conducted straightforward data post-processing for these two datasets. Specifically, we randomly concatenated different numbers of samples into a single sample. In the case of "llava," we concatenated 1 to 3 samples, while for "llava_dial," we concatenated 1 to 2 samples.
+
+## 4.2 LLaVA-Otter data blending
+We noticed that the llava and llava_dial datasets used by the LLaVA model and the otter_mimicit_cgd dataset used by the Otter model all use the COCO train2017 images. For the llava and llava_dial datasets, each sample includes single/multi-round conversations for a single image. For the otter_mimicit_cgd dataset, each sample includes a single-round conversation for a pair of images. This enables us to build a synthesized multi-round multi-image dataset, llava_otter_blend, as a more natural blending: for each sample in the otter_mimicit_cgd dataset, we look for llava and llava_dial samples that use the same image, and then build a new sample in a "llava/llava_dial conversations then otter_mimicit_cgd conversation" fashion.
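+
+A hedged sketch of this blending step (again, field names such as `image_id`, `image_ids`, and `conversations` are hypothetical):
+
+```
+from collections import defaultdict
+
+def blend(llava_samples, otter_cgd_samples):
+    by_image = defaultdict(list)        # index llava/llava_dial samples by image id
+    for s in llava_samples:
+        by_image[s["image_id"]].append(s)
+    blended = []
+    for s in otter_cgd_samples:         # each sample covers a pair of images
+        prefix = [p for img in s["image_ids"] for p in by_image.get(img, [])]
+        if prefix:                      # llava/llava_dial turns first, then otter's
+            convo = sum((p["conversations"] for p in prefix), []) + s["conversations"]
+            blended.append({"image_ids": s["image_ids"], "conversations": convo})
+    return blended
+```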
+
+
+
+
+ *Figure 5: A data sample after LLaVA-Otter data blending. Gray dialog boxes are from the LLaVA datasets, and orange ones are from the Otter dataset.*
+
+
+# 5. Demonstration
+We trained our DeepSpeed-VisualChat-13B model with a 2B visual encoder and the 13B LLaMA model on several open-sourced datasets. DeepSpeed-VisualChat-13B shows image captioning capabilities (*Figure 6--8*), counting and text reading (*Figure 6*), celebrity recognition (*Figure 7*), storytelling (*Figure 8*), etc.
+
+
+
+
+ *Figure 6: DeepSpeed-VisualChat can count the number of people in the image and read the text in the first image. It also demonstrates cross-image understanding.*
+
+
+
+
+
+
+ *Figure 7: DeepSpeed-VisualChat can recognize celebrities and associate them with their achievements.*
+
+
+
+
+
+
+ *Figure 8: DeepSpeed-VisualChat can tell stories and recognize movies.*
+
+
+
+# 6. How to begin with DeepSpeed-VisualChat
+DeepSpeed-VisualChat is an easy-to-use training framework with great scalability, having been tested up to LLaMa-2-70B models so far. We adopt a unified instruction tuning format for all experiments, and the template is shown below.
+```
+ % You are a powerful vision-language assistant.
+
+### Image 1: % some image, e.g., cat-1.png
+### Question: % please describe the image.
+### Answer: % It's a cute black cat.
+
+### Image 2: % some image, e.g., cat-2.png
+### Image 3: % some image, e.g., cat-3.png
+### Question: % What's the difference between the three cats?
+### Answer: % The colors of the three cats are different.
+...
+```
+
+Training a model with DeepSpeed-VisualChat is straightforward and convenient. Here we give an example based on the CLIP visual encoder and the LLaMa-7B model:
+```
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/
+pip install -r requirements.txt
+cd training
+bash training_scripts/run_7b.sh
+```
+
+The trained checkpoint will be automatically saved in a Hugging Face-compatible version and can be used to launch your own visual chat API:
+```
+cd ../chat
+bash chat_scripts/run.sh # You need to change necessary variables, e.g., ckpt path
+```
+To support larger model inference, we have incorporated Hugging Face large model inference into our DeepSpeed-VisualChat API. Therefore, users can choose a different number of GPUs based on the GPU memory capacity and the model size.
+
+Please refer to our [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details.
+
+# 7. Release: Try DeepSpeed-VisualChat today!
+
+We are very excited to share that DeepSpeed-VisualChat is now open-sourced and available to the AI community.
+
+* To get started, please visit our GitHub page for DeepSpeed-VisualChat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)
+
+* We will continue to improve DeepSpeed-VisualChat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) shows currently supported features as well as ones that are planned for the future.
+
+
+DeepSpeed-VisualChat is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
+
+* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work!
diff --git a/blogs/deepspeed-visualchat/assets/images/attention.png b/blogs/deepspeed-visualchat/assets/images/attention.png
new file mode 100644
index 000000000000..b01d8f8027ce
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/attention.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/cat-chat.png b/blogs/deepspeed-visualchat/assets/images/cat-chat.png
new file mode 100755
index 000000000000..5a5c27381f65
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/cat-chat.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/ceos.png b/blogs/deepspeed-visualchat/assets/images/ceos.png
new file mode 100644
index 000000000000..e148f545a44b
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/ceos.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/data-blending.png b/blogs/deepspeed-visualchat/assets/images/data-blending.png
new file mode 100644
index 000000000000..a8afb5144fb1
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/data-blending.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/friends.png b/blogs/deepspeed-visualchat/assets/images/friends.png
new file mode 100644
index 000000000000..2689d8d4bb1c
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/friends.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/hero-figure.png b/blogs/deepspeed-visualchat/assets/images/hero-figure.png
new file mode 100644
index 000000000000..ca79b2c6239f
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/hero-figure.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/lake-chat.png b/blogs/deepspeed-visualchat/assets/images/lake-chat.png
new file mode 100755
index 000000000000..c47199737d54
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/lake-chat.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/model.png b/blogs/deepspeed-visualchat/assets/images/model.png
new file mode 100644
index 000000000000..dbd1f05c484b
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/model.png differ
diff --git a/blogs/deepspeed-visualchat/assets/images/zootopia.png b/blogs/deepspeed-visualchat/assets/images/zootopia.png
new file mode 100644
index 000000000000..c9e3783ed198
Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/zootopia.png differ
diff --git a/blogs/deepspeed4science/README.md b/blogs/deepspeed4science/README.md
new file mode 100644
index 000000000000..a318490329a5
--- /dev/null
+++ b/blogs/deepspeed4science/README.md
@@ -0,0 +1,18 @@
+
+
+# Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies
+
+
+
+[https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)
+
+To cite DeepSpeed4Science, please cite our [white paper](https://arxiv.org/abs/2310.04610):
+
+```
+@article{song2023deepspeed4science,
+ title={DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies},
+ author={Song, Shuaiwen Leon and Kruft, Bonnie and Zhang, Minjia and Li, Conglong and Chen, Shiyang and Zhang, Chengming and Tanaka, Masahiro and Wu, Xiaoxia and Rasley, Jeff and Awan, Ammar Ahmad and others},
+ journal={arXiv preprint arXiv:2310.04610},
+ year={2023}
+}
+```
diff --git a/blogs/deepspeed4science/chinese/README.md b/blogs/deepspeed4science/chinese/README.md
new file mode 100644
index 000000000000..dabc4ab077f2
--- /dev/null
+++ b/blogs/deepspeed4science/chinese/README.md
@@ -0,0 +1,156 @@
+
+
+# DeepSpeed4Science:利用先进的AI系统优化技术实现科学发现
+
+
+
+*此博客为英文博客[Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)的官方翻译*
+
+
+
+
+*图1:DeepSpeed4Science方法概述:专为加速科学发现和应对其复杂性而量身定制的AI系统技术开发。*
+
+
+如需引用 DeepSpeed4Science,请引用我们的[white paper](https://arxiv.org/abs/2310.04610):
+
+```
+@article{song2023deepspeed4science,
+ title={DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies},
+ author={Song, Shuaiwen Leon and Kruft, Bonnie and Zhang, Minjia and Li, Conglong and Chen, Shiyang and Zhang, Chengming and Tanaka, Masahiro and Wu, Xiaoxia and Rasley, Jeff and Awan, Ammar Ahmad and others},
+ journal={arXiv preprint arXiv:2310.04610},
+ year={2023}
+}
+```
+
+## 简介
+
+在接下来的十年中,深度学习可能会彻底改变自然科学,增强我们对自然现象进行建模和预测的能力。这可能预示着科学探索的新时代,为从药物开发到可再生能源的各个领域带来重大进展。为了响应这一机会以及微软“予力全球每一人、每一组织,成就不凡”的使命,[微软DeepSpeed团队](https://www.deepspeed.ai/)启动了一个名为[DeepSpeed4Science](https://deepspeed4science.ai/)的新计划,旨在通过AI系统技术创新帮助领域专家解锁当今最大的科学之谜。
+
+[DeepSpeed](https://www.deepspeed.ai/)系统是由微软开发的业界领先的开源AI系统框架,它为各种AI硬件上的深度学习训练和推理提供了前所未有的规模和速度。图1展示了我们对DeepSpeed4Science这一新计划的基本方法。通过利用DeepSpeed当前的技术方案(训练、推理和压缩)作为基础技术推动器,DeepSpeed4Science将创建一套专为加速科学发现而量身定制的AI系统技术,以应对其独特的复杂性,超越用于加速通用大型语言模型(LLMs)的常见技术方法。我们与拥有科学AI模型的内部和外部团队紧密合作,以发现和解决领域特定AI系统的挑战。这包括气候科学、药物设计、生物学理解、分子动力学模拟、癌症诊断和监测、催化剂/材料发现以及其他领域。
+
+我们的长期愿景是将DeepSpeed4Science发展成一个用于分享支持科学发现的先进AI技术的软件平台和统一代码仓库。DeepSpeed4Science的设计旨在实现包容性,呼应微软的[“AI for Good”承诺](https://www.microsoft.com/en-us/ai/ai-for-good)。这体现在该计划对一系列标志性科学模型的支持上,它们代表了一些最关键的AI4Science应用场景。在这篇博客中,我们展示了DeepSpeed4Science如何帮助解决结构生物学研究中的两个关键AI系统挑战:(1) 解决了以Evoformer为中心的蛋白质结构预测模型中的内存爆炸问题,以及(2)为更好地理解引发大流行的病毒的进化提供AI模型长序列支持。
+
+## 我们的初期主要合作者
+
+DeepSpeed4Science的新系统技术可以用于很多推动科学边界的标志性模型,赋能AI驱动的科学发现。目前,DeepSpeed4Science很荣幸地支持来自[微软研究院AI4Science](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai4science/)、[微软WebXT/Bing](https://www.msn.com/en-us/weather/forecast/)、[美国能源部国家实验室](https://www.energy.gov/national-laboratories)和多所大学的几个关键科学模型。
+
+### 微软内部合作伙伴
+
+#### 科学基础模型(Scientific Foundation Model,SFM),微软研究院AI4Science
+
+
+
+
+
+*图2:科学基础模型(Scientific Foundation Model,SFM)及其当前探索:Distributional Graphormer。*
+
+
+科学基础模型(SFM)旨在创建一个统一的大规模基础模型,以支持自然科学发现,支持多种输入、多个科学领域(例如,药物、材料、生物学、健康等)和计算任务。DeepSpeed4Science合作伙伴关系将为SFM团队提供新的训练和推理技术,以支持他们的新生成AI方法(例如[Distributional Graphormer](https://www.microsoft.com/en-us/research/blog/distributional-graphormer-toward-equilibrium-distribution-prediction-for-molecular-systems/))这样的项目进行持续研究。
+
+#### ClimaX,微软研究院AI4Science
+
+
+
+
+*图3:ClimaX是第一个设计用于执行各种天气和气候建模任务的基础模型。*
+
+
+我们的气候正在发生变化,导致极端天气事件的频率增加。为了减轻负面影响,预测这些事件将发生的地方变得越来越重要。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)是第一个设计用于执行各种天气和气候建模任务的基础模型。它可以吸收许多具有不同变量和分辨率的数据集以提高天气预报的准确性。DeepSpeed4Science正在为ClimaX创建新的系统支持和加速策略,以高效地预训练/微调更大的基础模型,同时处理非常大的高分辨率图像数据(例如,数十到数百PB)和长序列。
+
+#### 分子动力学和机器学习力场(Molecular Dynamics and Machine Learning Force Field),微软研究院AI4Science
+
+
+
+
+*图4:一百万步的分子动力学模拟:RBD-蛋白(RBD-protein)与蛋白抑制剂(protein inhibitor)相互作用。*
+
+
+这个项目模拟了使用[AI驱动的力场模型](https://www.microsoft.com/en-us/research/publication/ai2bmd-efficient-characterization-of-protein-dynamics-with-ab-initio-accuracy/)进行近似第一性原理计算精度的大型(百万原子)分子系统的动态模拟,同时保持了经典分子动力学的效率和可扩展性。这些模拟足够高效,可以生成足够长的轨迹来观察化学上有意义的事件。通常,这个过程需要数百万甚至数十亿的推理步骤。这对优化图神经网络(GNN)+ LLM模型的推理速度提出了重大挑战,DeepSpeed4Science将为此提供新的加速策略。
+
+#### 微软天气,微软WebXT/Bing
+
+
+
+
+*图5:微软降水预报(每4分钟一次对接下来4小时进行预测)。*
+
+
+[微软天气](https://www.msn.com/en-us/weather/forecast/)提供精确的天气信息,[帮助用户为他们的生活方式、健康、工作和活动做出更好的决策](https://blogs.windows.com/windowsexperience/2022/08/31/microsoft-joins-noaas-weather-ready-nation-ambassador-initiative-to-help-improve-americas-readiness-and-response-to-weather-events/)——包括每小时多次更新的准确的10天全球天气预报。此前,微软天气受益于DeepSpeed技术,加速了他们的多GPU训练环境。目前,DeepSpeed4Science正在与微软WebXT天气预报团队合作,进一步增强微软天气预报服务的最新功能和改进。
+
+### 外部合作者
+
+DeepSpeed4Science的旅程始于两个开创性的基于LLM的结构生物学研究AI模型:来自哥伦比亚大学的[OpenFold](https://openfold.io/),一个开源的高保真蛋白质结构预测模型;以及来自[阿贡国家实验室](https://www.anl.gov/)的[GenSLMs](https://github.com/ramanathanlab/genslm),一个获得[ACM戈登贝尔奖](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)的用于学习SARS-CoV-2(COVID-19)基因组的进化的语言模型。作为此次发布的特色展示,它们代表了当今AI驱动的结构生物学研究面临的两个常见AI系统挑战。我们将在下一节中讨论DeepSpeed4Science如何赋能这些科学研究。
+
+此外,DeepSpeed4Science最近扩大了其范围,以支持更多样的科学模型。例如,在我们与阿贡国家实验室合作训练[Aurora Exascale系统](https://www.anl.gov/aurora)上的万亿参数科学模型的工作中,DeepSpeed4Science技术将帮助他们达到这一关键任务所需的性能要求和可扩展性。此外,通过与[橡树岭国家实验室](https://ai-roadmap.ornl.gov/)和[国家癌症研究所(NCI)](https://www.cancer.gov/)合作进行癌症监测,DeepSpeed4Science将帮助从非结构化的临床文本中高保真地提取和分类信息,以供[MOSSAIC项目](https://www.olcf.ornl.gov/tag/mossaic/)使用。[Brookhaven国家实验室](https://www.bnl.gov/world/)还将采用DeepSpeed4Science技术,支持使用LLMs开发大型数字双胞胎模型,以便为清洁能源研究产生更真实的模拟数据。您可以在[deepspeed4science.ai](https://deepspeed4science.ai/)上找到有关我们外部合作者及其科学任务的更多详细信息。
+
+## 合作展示
+
+### 展示(I):DeepSpeed4Science通过DS4Sci_EvoformerAttention消除以Evoformer为中心的结构生物学模型的内存爆炸问题
+
+
+
+
+
+*图6:在训练过程中OpenFold对PDB链7B3A_A的预测。*
+
+
+[OpenFold](https://github.com/aqlaboratory/openfold)是DeepMind的[AlphaFold2](https://alphafold.com/)的开源社区再现,使其可以在新数据集上训练或微调AlphaFold2。研究人员已经使用它从头开始重新训练AlphaFold2,生成新的模型参数集,研究AlphaFold2的早期训练阶段(图6),并开发新的蛋白质折叠系统。
+
+
+
+
+*图7:在OpenFold中,对多序列比对(MSA)Attention内核(包含偏差)变体的训练峰值内存需求。 (左) 使用在AlphaFold2中的EvoformerAttention的原始OpenFold实现。对于这些类型的蛋白质结构预测模型,在训练/推理中的内存爆炸问题是常见的。最先进的FlashAttention无法有效支持这样的Attention变体。 (右) DeepSpeed4Science的一种新解决方案DS4Sci_EvoformerAttention在不影响模型品质的条件下显著地减少了OpenFold的训练峰值内存需求(最多13倍)。*
+
+
+尽管OpenFold使用了最先进的系统技术进行性能和内存优化,但从头开始训练AlphaFold2在计算上仍然很昂贵。现阶段的模型参数量很小,只有9300万个,但它包含了几个需要极大中间内存的特殊Attention变体。在标准AlphaFold2训练的“微调”阶段,仅其中一个变体在半精度下生成的张量就超过了12GB,使其峰值内存需求远远超过相同大小的语言模型。即使使用像activation checkpointing和DeepSpeed ZeRO优化这样的技术,这种内存爆炸问题仍然严重限制了可训练模型的序列长度和MSA深度。此外,近似策略可能会显著影响模型的准确性和收敛性,同时仍然导致内存爆炸,如图7左侧(橙色)所示。
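+
+注意力 logits 的内存占用随序列长度二次增长,这正是内存爆炸的根源。下面用一组纯属示意的维度(并非 OpenFold 的真实配置)粗略估算半精度下单个 MSA 注意力变体的 logits 张量大小:
+
+```
+# logits 形状为 (n_seq, n_heads, n_res, n_res),半精度下每个元素占 2 字节
+n_seq, n_heads, n_res = 512, 8, 1024   # 假设值,仅用于演示
+logits_bytes = n_seq * n_heads * n_res * n_res * 2
+print(f"{logits_bytes / 2**30:.1f} GiB")  # 8.0 GiB,且随 n_res 二次增长
+```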
+
+为了应对结构生物学研究(例如,蛋白质结构预测和平衡分布预测)中的这一常见系统挑战,DeepSpeed4Science通过为这类科学模型中广泛出现的注意力变体(即EvoformerAttention)设计定制的精确注意力内核来解决这一内存效率问题。具体来说,我们设计了一套由复杂的融合/矩阵分块策略和动态内存减少方法而组成的高内存效率DS4Sci_EvoformerAttention内核,作为高质量机器学习模块供更广泛的生物学研究社区使用。通过整合到OpenFold中,这些定制内核在训练期间提供了显著的加速,并显著减少了模型的训练和推理的峰值内存需求。这使得OpenFold可以用更大、更复杂的模型,使用更长的序列在更广泛的硬件上进行实验。关于这项技术的详细信息可以在[这里](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/)找到。
+
+### 展示(II):DeepSpeed4Science通过系统和算法方法为基因组基础模型(例如,GenSLMs)提供长序列支持
+
+
+
+
+*图8:GenSLMs:获2022年ACM戈登贝尔奖的COVID基因组模型(基于GPT-NeoX的25B/33B模型),用于学习描述SARS-CoV-2基因组生物学意义的潜在空间。这个GIF展示了重要蛋白质家族苹果酸脱氢酶(malate dehydrogenase)的潜在空间投影,并按序列长度和GC含量(核酸中鸟嘌呤和胞嘧啶相对于腺嘌呤和胸腺嘧啶的比例,用于衡量DNA链的耐热能力)等重要特征着色。*
+
+
+[GenSLMs](https://github.com/ramanathanlab/genslm),一个来自阿贡国家实验室的[2022年ACM 戈登贝尔奖获奖](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)的基因组模型,可以通过大型语言模型(LLMs)的基因组数据训练来学习SARS-CoV-2(COVID-19)基因组的进化。它旨在改变如何识别和分类引发大流行的病毒(特别是SARS-CoV-2)的新变种。GenSLMs代表了第一批可以泛化到其他预测任务的基因组基础模型。对潜在空间的良好理解可以帮助GenSLMs处理超出仅仅是病毒序列的新领域,并扩展它们模拟细菌病原体甚至真核生物的能力(例如,理解功能、途径成员资格和进化关系等事物)。为了实现这一科学目标,GenSLMs和类似的模型需要非常长的序列支持用于训练和推理,这超出了像[FlashAttention](https://arxiv.org/abs/2307.08691)这样的通用LLM的长序列策略。通过DeepSpeed4Science的新设计,科学家现在可以构建和训练具有显著更长的上下文窗口的模型,允许他们探索以前无法访问的关系。
+
+
+
+
+*图9:由不同框架在不同规模下支持的两个GenSLMs模型的最大序列长度。使用NVIDIA DGX,每个节点有八个40G A100 GPU。*
+
+
+具体在系统层面,我们发布了包括[长序列支持和其他新优化](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)的最新的[Megatron-DeepSpeed框架](https://github.com/microsoft/Megatron-DeepSpeed)。科学家现在可以通过我们新添加的内存优化技术(如注意力掩码卸载和位置编码分割)、张量并行、流水线并行、序列并行、基于ZeRO的数据并行和模型状态卸载等技术的协同组合,用更长的序列训练他们的GenSLMs等大型科学模型。图9展示了我们的新版本使GenSLMs的25B和33B模型的最长序列长度分别比之前的Megatron-DeepSpeed版本增加了12倍和14倍。在支持的序列长度方面,这个新Megatron-DeepSpeed框架也显著地超过了NVIDIA的Megatron-LM(对于25B和33B模型分别高达9.8倍和9.1倍)。例如,阿贡实验室团队的GenSLMs 25B模型在64个GPU上的原始序列长度为42K,而现在可以用512K的核苷酸序列进行训练。这在不损失准确性的条件下大大提高了模型质量和科学发现的范围。对于更偏好相对位置编码等算法策略的领域科学家,这个[新版本](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)也集成了相应的支持。
+
+## 总结和路线图
+
+我们非常自豪和兴奋地宣布DeepSpeed4Science计划以及几个研发亮点和成果。从今天开始,我们将在[deepspeed4science.ai](https://deepspeed4science.ai/)上介绍我们的新计划,包括关于我们的外部合作者的信息,以及当前和未来的DeepSpeed4Science技术发布。我们的一个高层次目标是推广广泛解决大规模科学发现的主要系统痛点的AI系统技术。我们希望全球的科学家们能够从DeepSpeed4Science通过开源软件解锁的新功能中受益。我们期待更好地了解阻碍您的科学发现的AI系统设计挑战。我们真诚地欢迎您的参与,帮助构建一个更有前途的AI4Science未来。请发送电子邮件至 deepspeed-info@microsoft.com 与我们联系。我们鼓励您在我们的[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)上报告问题、贡献PR、参与讨论。
+
+## 致谢
+
+**Core DeepSpeed4Science Team:**
+
+Shuaiwen Leon Song (DeepSpeed4Science lead), Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Xiaoxia (Shirley) Wu, Masahiro Tanaka, Martin Cai, Adam Graham, Charlie Zhou, Yuxiong He (DeepSpeed team lead)
+
+**Our Founding Collaborators (in alphabetical order):**
+
+**Argonne National Lab team:** Rick Stevens, Cristina Negri, Rao Kotamarthi, Venkatram Vishwanath, Arvind Ramanathan, Sam Foreman, Kyle Hippe, Troy Arcomano, Romit Maulik, Maxim Zvyagin, Alexander Brace, Yuntian Deng, Bin Zhang, Cindy Orozco Bohorquez, Austin Clyde, Bharat Kale, Danilo Perez-Rivera, Heng Ma, Carla M. Mann, Michael Irvin, J. Gregory Pauloski, Logan Ward, Valerie Hayot, Murali Emani, Zhen Xie, Diangen Lin, Maulik Shukla, Weili Nie, Josh Romero, Christian Dallago, Arash Vahdat, Chaowei Xiao, Thomas Gibbs, Ian Foster, James J. Davis, Michael E. Papka, Thomas Brettin, Anima Anandkumar
+
+**AMD:** Ivo Bolsen, Micheal Schulte, Bo Begole, Angela Dalton, Steve Reinhart, Ashwin Aji, Jalal Mahmud, Mahesh Balashibramanian
+
+**Brookhaven National Lab team:** Adolfy Hoisie, Shinjae Yoo, Yihui Ren.
+
+**Columbia University OpenFold team:** Mohammed AlQuraishi, Gustaf Ahdritz
+
+**Microsoft Research AI4Science team:** Christopher Bishop, Bonnie Kruft, Max Welling, Tie-Yan Liu, Christian Bodnar, Johannes Brandsetter, Wessel Bruinsma, Chan Cao, Yuan-Jyue Chen, Peggy Dai, Patrick Garvan, Liang He, Elizabeth Heider, PiPi Hu, Peiran Jin, Fusong Ju, Yatao Li, Chang Liu, Renqian Luo, Qi Meng, Frank Noe, Tao Qin, Janwei Zhu, Bin Shao, Yu Shi, Wenlei Shi, Gregor Simm, Megan Stanley, Lixin Sun, Yue Wang, Tong Wang, Zun Wang, Lijun Wu, Yingce Xia, Leo Xia, Shufang Xie, Shuxin Zheng, Jianwei Zhu
+
+**Oakridge National Lab team:** Prassana Balaprakash, Georgia Tourass
+
+**Princeton University:** William Tang, Kyle Felker, Alexey Svyatkovskiy (Microsoft liaison)
+
+**Rutgers University:** Hang Liu
+
+**WebXT Weather team:** Pete Luferenko, Divya Kumar, Jonathan Weyn, Ruixiong Zhang, Sylwester Klocek, Volodymyr Vragov
diff --git a/blogs/deepspeed4science/japanese/README.md b/blogs/deepspeed4science/japanese/README.md
new file mode 100644
index 000000000000..276528650ab5
--- /dev/null
+++ b/blogs/deepspeed4science/japanese/README.md
@@ -0,0 +1,156 @@
+
+
+# DeepSpeed4Scienceイニシアティブ: 洗練されたAIシステムのテクノロジーにより大規模な科学的発見を可能に
+
+
+
+*こちらは英語ブログ[Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)の公式の翻訳です*
+
+
+
+
+*図1:DeepSpeed4Scienceのアプローチ: 汎用の言語モデルのサポートを超え、科学的発見とその複雑さの解決に特化したAI技術を開発*
+
+
+DeepSpeed4Science を引用するには、こちらの[white paper](https://arxiv.org/abs/2310.04610)を引用してください:
+
+```
+@article{song2023deepspeed4science,
+ title={DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies},
+ author={Song, Shuaiwen Leon and Kruft, Bonnie and Zhang, Minjia and Li, Conglong and Chen, Shiyang and Zhang, Chengming and Tanaka, Masahiro and Wu, Xiaoxia and Rasley, Jeff and Awan, Ammar Ahmad and others},
+ journal={arXiv preprint arXiv:2310.04610},
+ year={2023}
+}
+```
+
+## はじめに
+
+自然の出来事をモデル化し予測する深層学習の能力は急速に高まっており、次の10年間に、自然科学に革命を起こすかも知れません。薬の開発から再生可能エネルギーまでの各セクターで、大きな進展をもたらす新しい科学的探求の時代が到来するでしょう。「地球上のすべての人と組織がもっと多くのことを成し遂げられるようにする」というMicrosoftのミッションに従い、この機会に、[DeepSpeedチーム](https://www.deepspeed.ai/)では[DeepSpeed4Science](https://deepspeed4science.ai/)という新しいイニシアティブを立ち上げました。これは、AIシステム技術のイノベーションを通じて他に類を見ない技術を構築し、様々な分野の専門家が、科学分野における大きな謎を解き明かす手助けをすることを目指しています。
+
+[DeepSpeed](https://www.deepspeed.ai/)システムは、Microsoftが開発した、AI分野をリードするオープンソースのAIシステムのフレームワークであり、多様なAIハードウェア上での深層学習の訓練と推論において、前例のない規模と速度を実現します。図1は、この新しいDeepSpeed4Scienceイニシアティブでの基本的なアプローチを示しています。DeepSpeedの現在の柱となる技術(訓練、推論、圧縮)を基盤として活用しつつ、DeepSpeed4Scienceでは、大規模言語モデル(LLM)を加速するための汎用の技術的アプローチを超え、科学的発見を加速する目的で新たに構築された、一連のAIシステム技術を提供します。私たちは、重要な科学的ミッションを推進している、代表的な科学分野向けAIモデルを所有する内外のチームと連携し、ドメイン固有のAIシステムの課題を特定し、解決していきます。これには、気候科学、薬物設計、生物学的理解、分子動力学シミュレーション、がんの診断と監視、触媒/材料の発見、およびその他の分野が含まれます。
+
+私たちの長期的なビジョンは、DeepSpeed4Scienceを、科学的発見をサポートする先進的なAIシステム技術を共有するための新しいソフトウェアプラットフォームおよび統一的なリポジトリに発展させることです。DeepSpeed4Scienceは、Microsoftの[AI for Good](https://www.microsoft.com/en-us/ai/ai-for-good)のコミットメントを反映して、包括的に設計されています。このことは、AI4Scienceへのもっとも重要な投資の成果として構築された、様々な代表的モデルへの、DeepSpeed4Scienceイニシアティブによるサポートに現れています。このブログでは、DeepSpeed4Scienceが、構造生物学の研究における2つの重要なシステムの課題にどのように対処するかを紹介します:(1) Evoformer中心のタンパク質構造予測モデルをスケールアップする際に極めて大きなメモリが必要となる問題を解決し、(2) パンデミックを引き起こすウイルスの進化の様子をよりよく理解するための非常に長いシーケンスのサポートを可能にします。
+
+## 主要な初期コラボレータ
+
+DeepSpeed4Scienceによる新しいシステム技術はAI駆動の幅広い科学研究を強化するものです。現在、DeepSpeed4Scienceは、[Microsoft Research AI4Science](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai4science/)、[Microsoft WebXT/Bing](https://www.msn.com/en-us/weather/forecast/)、[U.S. DoE National Labs](https://www.energy.gov/national-laboratories)、および複数の大学のいくつかの重要な科学モデルをサポートしています。
+
+### Microsoft内のパートナーシップ
+
+#### 科学基盤モデル (Scientific Foundation Model, SFM), Microsoft Research AI4Science
+
+
+
+
+
+*図2: 科学基盤モデル (Scientific foundation model, SFM) とその探索: Distributional Graphormer*
+
+
+科学的基盤モデル(SFM)は、多様なインプット、複数の科学領域(薬物、材料、生物学、健康など)、および計算タスクをサポートする、自然科学的発見を強化するための統一された大規模基盤モデルを作成することを目的としています。DeepSpeed4Scienceパートナーシップは、[Distributional Graphormer](https://www.microsoft.com/en-us/research/blog/distributional-graphormer-toward-equilibrium-distribution-prediction-for-molecular-systems/)などのMicrosoftの新しい生成AI手法などのプロジェクトに関する、SFMチームの継続的な研究を強化するための新しい訓練および推論テクノロジーを提供します。
+
+#### ClimaX, Microsoft Research AI4Science
+
+
+
+
+*図3: 天気・気候の多様なモデリングタスクのための最初の基盤モデルClimaX*
+
+
+気候の変化は、より頻繁な異常気象を引き起こしています。悪影響を軽減するため、これらのイベントが発生する場所を予測することがますます重要になっています。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)は、さまざまな気象および気候モデリングタスクを実行するために設計された最初の基盤モデルです。さまざまな変数と解像度を持つ多くの異なるデータセットを扱えるため、天気予報の精度が向上する可能性があります。DeepSpeed4Scienceは、非常に大きな高解像度画像データ(数十から数百ペタバイトなど)を長いシーケンスで処理しながら、より大きな基盤モデルを効率的に事前訓練/ファインチューニングするためのClimaXの新しいシステムサポートを提供しています。
+
+#### 分子動力学と機械学習型力場(Molecular Dynamics and Machine Learning Force Field),Microsoft Research AI4Science
+
+
+
+
+*図4: 100万ステップの分子動力学シミュレーション: RBD-proteinとprotein inhibitorの相互作用*
+
+
+このプロジェクトは、古典的な分子動力学の効率とスケーラビリティを維持しながら、[AIを利用した力場モデル](https://www.microsoft.com/en-us/research/publication/ai2bmd-efficient-characterization-of-protein-dynamics-with-ab-initio-accuracy/)を使用して、原理に基づく精度(ab initio accuracy)に近い精度で大規模(原子数で100万規模)な分子システムの力学をシミュレートします。このシミュレーションは、化学的に重要なイベントを観察するのに十分な長さの軌道を生成できる効率を実現しています。通常、このプロセスには数百万から数十億の推論ステップが必要です。これは、グラフニューラルネットワーク(GNN)+ LLMモデルの推論速度を最適化する上で大きな課題となります。DeepSpeed4Scienceは、この課題に対して、新しいシステムサポートを提供します。
+
+#### 天気 from Microsoft Start, Microsoft WebXT/Bing
+
+
+
+
+*図5: Microsoft Startにおける降水予想 (次の4時間について4分ごと)*
+
+
+[天気 from Microsoft Start](https://www.msn.com/en-us/weather/forecast/)は、[ユーザーがライフスタイル、健康、仕事、活動についてより適切な決定を下せるよう](https://blogs.windows.com/windowsexperience/2022/08/31/microsoft-joins-noaas-weather-ready-nation-ambassador-initiative-to-help-improve-americas-readiness-and-response-to-weather-events/)、正確な気象情報を提供します。 (1 時間ごとに複数回更新される、10 日間に渡る正確かつグローバルな天気予報など)。 以前にも、この天気予報は、DeepSpeedの技術を使用して、マルチ GPU を用いた訓練を高速化していました。現在、DeepSpeed4ScienceはMicrosoft WebXT気象チームと協力して、最先端の機能と更なる改善により、マイクロソフトの気象サービスをさらに強化しています。
+
+### 外部のコラボレータ
+
+DeepSpeed4Scienceは、構造生物学研究のための2つの先駆的なLLMベースのAIモデルを扱うことから始まりました: オープンソースのハイフィデリティタンパク質構造予測モデルであるコロンビア大学の[OpenFold](https://openfold.io/)と、SARS-CoV-2(COVID-19)ゲノムの進化を学習する、[Gordon Bell Special Prize](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)を受賞したゲノム用言語モデルである[アルゴンヌ国立研究所](https://www.anl.gov/)の[GenSLMs](https://github.com/ramanathanlab/genslm)です。次のセクションでは、今日のAI主導の構造生物学研究が直面している2つの一般的なAIシステムの課題を紹介し、DeepSpeed4Scienceが科学研究をどのように強化したかについて説明します。
+
+またDeepSpeed4Scienceは最近、より多様な科学モデルをサポートするために、その対象を拡大しました。たとえば、[Aurora Exascaleシステム](https://www.anl.gov/aurora)で、1兆パラメータの科学モデルを訓練するアルゴンヌ国立研究所との協力にあたって、DeepSpeed4Scienceテクノロジーは、求められるパフォーマンス要件とスケーラビリティを実現するのに重要な役割を果たします。さらに、DeepSpeed4Scienceは、がんの調査に関して、[オークリッジ国立研究所](https://ai-roadmap.ornl.gov/)および[国立がん研究所(NCI)](https://www.cancer.gov/)と協力することにより、[MOSSAICプロジェクト](https://www.olcf.ornl.gov/tag/mossaic/)の非構造化臨床テキストからの情報の高信頼度抽出と分類にも用いられます。さらに、DeepSpeed4Scienceのテクノロジーは、[ブルックヘブン国立研究所](https://www.bnl.gov/world/)にも採用され、LLMを使用してより現実的なシミュレーションデータを生成することにより、クリーンエネルギー研究用の大規模なデジタルツインモデルの開発をサポートします。外部のコラボレータとその科学ミッションに関するより詳細な情報は、[deepspeed4science.ai](https://deepspeed4science.ai/)に掲載しています。
+
+## パートナーシップの事例
+
+### 事例(I): DeepSpeed4ScienceのDS4Sci_EvoformerAttentionにより、Evoformerで構成された生物学モデルをスケールアップする際のメモリ問題を解決
+
+
+
+
+
+*図6: モデル学習の進行に伴うPDB chain 7B3A_AについてのOpenFoldの予測*
+
+
+[OpenFold](https://github.com/aqlaboratory/openfold)は、DeepMindによる[AlphaFold2](https://alphafold.com/)をオープンソースで再現したものであり、新しいデータセットでAlphaFold2を訓練またはファインチューニングすることを可能にします。研究者は、これを使用して、AlphaFold2をゼロから再訓練して新しいモデルパラメータを作成し、AlphaFold2の初期訓練フェーズを研究し(図6)、新しいタンパク質フォールディングシステムを開発しました。
+
+
+
+
+*図7: OpenFoldで可能な最大の訓練サンプル次元を持つ多重配列アライメント(MSA)アテンションカーネル(バイアス付き)のバリエーションを訓練するために必要なピークメモリ。(左)AlphaFold2で使用されているEvoformerAttentionを用いたオリジナルのOpenFold実装。この種のタンパク質構造予測モデルの訓練/推論では、極めて多くのメモリが必要とされることは一般的な課題となっている。特に、最新技術として広く知られるFlashAttentionでも、このような科学研究のためのアテンションのバリエーションを効果的にサポートできない。(右)DS4Sci_EvoformerAttentionと呼ばれるDeepSpeed4Scienceの新しい技術は、精度を落とすことなく、OpenFoldモデルの訓練に必要なピークメモリを1/13に大幅に削減する。*
+
+
+OpenFoldには、最先端のシステムテクノロジーを使用したパフォーマンスとメモリの最適化が含まれていますが、AlphaFold2をゼロから訓練することは依然として大きな計算コストがかかります。現段階でのモデルは、パラメータ数の絶対値は小さい(9,300万個)のですが、極めて大きなアクティベーションを持つアテンションのバリエーションが含まれています。標準的なAlphaFold2訓練のファインチューニングフェーズでは、これらのバリエーションのうちの1つが生成したロジットテンソル(入力としてモデルに供給されるディープタンパク質MSAに対応するように設計されたもの)は、半精度浮動小数で12GBを超え、同等のサイズの言語モデルが使用するメモリを大幅に上回ります。Activation checkpointingや、DeepSpeed ZeRO 最適化などの手法を使用しても、非常に多くのメモリが必要とされるため、モデルを訓練できるシーケンスの長さと MSA の深さが大幅に制限されます。さらに、近似解を与えるような戦略を用いると、モデルの精度と収束に大きな影響を与える可能性があり、それでもメモリが爆発的に増加します(図7の左側のバー(オレンジ色))。
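+
+アテンションのlogitsが占めるメモリは配列長に対して二次関数的に増加し、これがメモリ爆発の原因です。以下は、説明用の仮の次元(OpenFoldの実際の設定ではありません)を使って、半精度でのMSAアテンションのlogitsテンソルのサイズを概算するスケッチです:
+
+```
+# logits の形状は (n_seq, n_heads, n_res, n_res)、半精度では1要素あたり2バイト
+n_seq, n_heads, n_res = 512, 8, 1024   # 説明用の仮の値
+logits_bytes = n_seq * n_heads * n_res * n_res * 2
+print(f"{logits_bytes / 2**30:.1f} GiB")  # 8.0 GiB(n_res に対して二次的に増加)
+```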
+
+DeepSpeed4Scienceは、構造生物学研究(タンパク質構造予測や平衡分布予測など)におけるこの一般的なシステムの課題に対処するために、このカテゴリの科学モデルに広く見られるアテンションのバリエーション(つまりEvoformerAttention)用にカスタマイズされた正確なアテンションのカーネルを設計することにより、このメモリの非効率性の問題に対処しています。具体的には、高度なフュージョン/タイリング戦略とオンザフライのメモリ削減方法によって可能になるメモリ効率の高いDS4Sci_EvoformerAttentionカーネルのセットを、高品質の機械学習プリミティブとして、より広いコミュニティ向けに作成しました。これらをOpenFoldに組み込むことで、訓練中の速度が大幅に向上し、訓練と推論のためのモデルのピークメモリが大幅に削減されます。これにより、OpenFoldはより大きく、より複雑なモデル、より長いシーケンスで実験し、より幅広いハードウェアで訓練することができます。この技術の詳細については、[こちら](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/)をご覧ください。
+
+### 事例(II): DeepSpeed4Scienceのシステムとアルゴリズムの両方からのアプローチにより、ゲノム基盤モデルでの非常に長い系列の使用をサポート
+
+
+
+
+*図8: GenSLMs:2022年ACM Gordon Bell Special Prize受賞COVIDゲノム用モデル(GPT-NeoXに基づく25B/33Bモデル)。SARS-CoV-2ゲノムの生物学的に意味のある特性を記述する潜在空間を学習するために使用される。このGIFは、重要なタンパク質ファミリーであるリンゴ酸デヒドロゲナーゼ(malate dehydrogenase)を可視化し、配列の長さやGC含量(アデニンとチミンと比較した核酸グアニンとシトシンの含量の比率。これはDNA鎖が熱に耐える能力を測るものである。)などの重要な特徴で色付けされた潜在空間の投影を表示している。*
+
+
+アルゴンヌ国立研究所が開発し、[2022年ACM Gordon Bell Special Prize](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)を受賞したゲノム用言語モデルである[GenSLMs](https://github.com/ramanathanlab/genslm)は、ゲノムデータに大規模言語モデル(LLM)を適用することにより、SARS-CoV-2(COVID-19)ゲノムの進化を学習します。これは、パンデミックを引き起こすウイルス、特にSARS-CoV-2の新たに出現する亜種を特定し、分類する方法を変えるように設計されています。GenSLMsは、他の予測タスクに一般化できる最初のゲノム基盤モデルの1つです。潜在空間をうまく表現することにより、GenSLMsはウイルス配列だけでなく新しいドメインに適用し、細菌性病原体や真核生物をモデル化する能力を拡大し、機能、経路のメンバーシップ、進化的関係などを理解することができます。この科学的目標を達成するために、GenSLMsおよび同様のモデルは、[FlashAttention](https://arxiv.org/abs/2307.08691)のように、長いシーケンスのための一般的な戦略では扱うことが困難なレベルの、非常に長いシーケンスサポートを、訓練と推論の両方に対して必要とします。DeepSpeed4Scienceの新しい設計により、科学者はより長いシーケンスでモデルを構築および訓練できるようになり、以前は扱えなかった科学探索が可能になりました。
+
+
+
+
+*図9: 異なるスケールで異なるフレームワークがサポートする2つのGenSLMsモデルの最大シーケンス長。1ノードあたり8個の40G A100 GPUを搭載したNVIDIA DGXノードを使用。*
+
+
+システムレベルでは、非常に長いシーケンスをサポートするための最新の[Megatron-DeepSpeedフレームワーク](https://github.com/microsoft/Megatron-DeepSpeed)を、[他の新しい最適化とともにリリースします](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)。科学者は、(アテンションマスクと位置の埋め込みに関する)新しく追加されたメモリ最適化手法、テンソル並列処理、パイプライン並列処理、シーケンス並列処理、ZeROスタイルのデータ並列処理、モデル状態のオフロードなどの技術の相乗的な組み合わせにより、GenSLMsのような大規模な科学モデルをはるかに長いシーケンスで訓練できるようになりました。図9は、新しいリリースにより、GenSLMsの25Bおよび33Bモデルで、以前のMegatron-DeepSpeedよりもそれぞれ最大12倍および14倍の最長シーケンス長を処理できることを示しています。サポートされているシーケンス長に関しては、この新しいMegatron-DeepSpeedは、25Bモデルと33Bモデルでそれぞれ最大9.8倍と9.1倍でNVIDIAのMegatron-LMを大幅に上回っています。たとえば、GenSLMsの25Bモデルは、64個のGPUでのアルゴンヌチームの元の42Kシーケンス長と比較して、512Kのヌクレオチド配列で訓練できるようになりました。これにより、精度を損なうことなく、モデルの品質と科学的発見の範囲が大幅に向上します。Relative position embeddingなどのアルゴリズム戦略を必要とする科学者向けの追加サポートも、[このリリース](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)に統合されています。
+
+## まとめとロードマップ
+
+DeepSpeed4Scienceイニシアティブを、いくつかのR&Dのハイライトや成果と共に発表できることを嬉しく思います。本日から、外部の協力者に関する情報や、現在および将来のDeepSpeed4Scienceテクノロジーリリースなど、新しいイニシアティブでの活動を[deepspeed4science.ai](https://deepspeed4science.ai/)上で進めていきます。私たちの高レベルな目標の1つは、大規模な科学的発見のための主要なシステムの問題点に広く対処するAIシステムテクノロジーを一般化することです。世界中の科学者によって、オープンソースのソフトウェアを通じてDeepSpeed4Scienceによって利用可能になる新機能が活用されることを願っています。科学的発見の障害となるAIシステム設計の課題を解決していくことを楽しみにしています。AI4Scienceの有望な未来を築くために、皆様の参加を歓迎します。お問い合わせは deepspeed-info@microsoft.com までお願いします。問題の報告や、PRを通じての貢献、ディスカッションへの参加は、[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)でお願いします。
+
+## 謝辞
+
+**Core DeepSpeed4Science Team:**
+
+Shuaiwen Leon Song (DeepSpeed4Science lead), Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Xiaoxia (Shirley) Wu, Masahiro Tanaka, Martin Cai, Adam Graham, Charlie Zhou, Yuxiong He (DeepSpeed team lead)
+
+**Our Founding Collaborators (in alphabetical order):**
+
+**Argonne National Lab team:** Rick Stevens, Cristina Negri, Rao Kotamarthi, Venkatram Vishwanath, Arvind Ramanathan, Sam Foreman, Kyle Hippe, Troy Arcomano, Romit Maulik, Maxim Zvyagin, Alexander Brace, Yuntian Deng, Bin Zhang, Cindy Orozco Bohorquez, Austin Clyde, Bharat Kale, Danilo Perez-Rivera, Heng Ma, Carla M. Mann, Michael Irvin, J. Gregory Pauloski, Logan Ward, Valerie Hayot, Murali Emani, Zhen Xie, Diangen Lin, Maulik Shukla, Weili Nie, Josh Romero, Christian Dallago, Arash Vahdat, Chaowei Xiao, Thomas Gibbs, Ian Foster, James J. Davis, Michael E. Papka, Thomas Brettin, Anima Anandkumar
+
+**AMD:** Ivo Bolsen, Micheal Schulte, Bo Begole, Angela Dalton, Steve Reinhart, Ashwin Aji, Jalal Mahmud, Mahesh Balashibramanian
+
+**Brookhaven National Lab team:** Adolfy Hoisie, Shinjae Yoo, Yihui Ren.
+
+**Columbia University OpenFold team:** Mohammed AlQuraishi, Gustaf Ahdritz
+
+**Microsoft Research AI4Science team:** Christopher Bishop, Bonnie Kruft, Max Welling, Tie-Yan Liu, Christian Bodnar, Johannes Brandsetter, Wessel Bruinsma, Chan Cao, Yuan-Jyue Chen, Peggy Dai, Patrick Garvan, Liang He, Elizabeth Heider, PiPi Hu, Peiran Jin, Fusong Ju, Yatao Li, Chang Liu, Renqian Luo, Qi Meng, Frank Noe, Tao Qin, Janwei Zhu, Bin Shao, Yu Shi, Wenlei Shi, Gregor Simm, Megan Stanley, Lixin Sun, Yue Wang, Tong Wang, Zun Wang, Lijun Wu, Yingce Xia, Leo Xia, Shufang Xie, Shuxin Zheng, Jianwei Zhu
+
+**Oakridge National Lab team:** Prassana Balaprakash, Georgia Tourass
+
+**Princeton University:** William Tang, Kyle Felker, Alexey Svyatkovskiy (Microsoft liaison)
+
+**Rutgers University:** Hang Liu
+
+**WebXT Weather team:** Pete Luferenko, Divya Kumar, Jonathan Weyn, Ruixiong Zhang, Sylwester Klocek, Volodymyr Vragov
diff --git a/blogs/deepspeed4science/media/Figure1.png b/blogs/deepspeed4science/media/Figure1.png
new file mode 100644
index 000000000000..614c4b40d6a1
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure1.png differ
diff --git a/blogs/deepspeed4science/media/Figure2-1.png b/blogs/deepspeed4science/media/Figure2-1.png
new file mode 100644
index 000000000000..bb0b8d9206d1
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-1.png differ
diff --git a/blogs/deepspeed4science/media/Figure2-2.gif b/blogs/deepspeed4science/media/Figure2-2.gif
new file mode 100644
index 000000000000..0890be7d7e31
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-2.gif differ
diff --git a/blogs/deepspeed4science/media/Figure3.png b/blogs/deepspeed4science/media/Figure3.png
new file mode 100644
index 000000000000..465e80e15a25
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure3.png differ
diff --git a/blogs/deepspeed4science/media/Figure4.gif b/blogs/deepspeed4science/media/Figure4.gif
new file mode 100644
index 000000000000..b45a5f28fd36
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure4.gif differ
diff --git a/blogs/deepspeed4science/media/Figure5.gif b/blogs/deepspeed4science/media/Figure5.gif
new file mode 100644
index 000000000000..a26c20103269
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure5.gif differ
diff --git a/blogs/deepspeed4science/media/Figure6-1.png b/blogs/deepspeed4science/media/Figure6-1.png
new file mode 100644
index 000000000000..65f7f9309f71
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure6-1.png differ
diff --git a/blogs/deepspeed4science/media/Figure6-2.gif b/blogs/deepspeed4science/media/Figure6-2.gif
new file mode 100644
index 000000000000..b50588c227d7
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure6-2.gif differ
diff --git a/blogs/deepspeed4science/media/Figure7.jpg b/blogs/deepspeed4science/media/Figure7.jpg
new file mode 100644
index 000000000000..eaa92007268b
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure7.jpg differ
diff --git a/blogs/deepspeed4science/media/Figure8.gif b/blogs/deepspeed4science/media/Figure8.gif
new file mode 100644
index 000000000000..624384910f2a
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure8.gif differ
diff --git a/blogs/deepspeed4science/media/Figure9.png b/blogs/deepspeed4science/media/Figure9.png
new file mode 100644
index 000000000000..f00fd9b6917f
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure9.png differ
diff --git a/blogs/zeropp/assets/images/eval1.png b/blogs/zeropp/assets/images/eval1.png
new file mode 100644
index 000000000000..8312c1db6de1
Binary files /dev/null and b/blogs/zeropp/assets/images/eval1.png differ
diff --git a/blogs/zeropp/assets/images/eval2.png b/blogs/zeropp/assets/images/eval2.png
new file mode 100644
index 000000000000..b6fd05f8cd98
Binary files /dev/null and b/blogs/zeropp/assets/images/eval2.png differ
diff --git a/blogs/zeropp/assets/images/eval3.png b/blogs/zeropp/assets/images/eval3.png
new file mode 100644
index 000000000000..4675e2041d84
Binary files /dev/null and b/blogs/zeropp/assets/images/eval3.png differ
diff --git a/blogs/zeropp/assets/images/hpz.png b/blogs/zeropp/assets/images/hpz.png
new file mode 100644
index 000000000000..790903cff68b
Binary files /dev/null and b/blogs/zeropp/assets/images/hpz.png differ
diff --git a/blogs/zeropp/assets/images/overview.png b/blogs/zeropp/assets/images/overview.png
new file mode 100644
index 000000000000..8e261b533528
Binary files /dev/null and b/blogs/zeropp/assets/images/overview.png differ
diff --git a/blogs/zeropp/assets/images/qgz.gif b/blogs/zeropp/assets/images/qgz.gif
new file mode 100644
index 000000000000..90716d325a04
Binary files /dev/null and b/blogs/zeropp/assets/images/qgz.gif differ
diff --git a/blogs/zeropp/assets/images/qwz.png b/blogs/zeropp/assets/images/qwz.png
new file mode 100644
index 000000000000..ae68c322668f
Binary files /dev/null and b/blogs/zeropp/assets/images/qwz.png differ
diff --git a/blogs/zeropp/assets/images/rlhf-eval.png b/blogs/zeropp/assets/images/rlhf-eval.png
new file mode 100644
index 000000000000..d9b1f3d272c1
Binary files /dev/null and b/blogs/zeropp/assets/images/rlhf-eval.png differ
diff --git a/blogs/zeropp/assets/images/zero-overview.gif b/blogs/zeropp/assets/images/zero-overview.gif
new file mode 100644
index 000000000000..65051947f79d
Binary files /dev/null and b/blogs/zeropp/assets/images/zero-overview.gif differ
diff --git a/blogs/zeropp/chinese/README.md b/blogs/zeropp/chinese/README.md
new file mode 100644
index 000000000000..e4a6b5279de5
--- /dev/null
+++ b/blogs/zeropp/chinese/README.md
@@ -0,0 +1,185 @@
+
+
+# DeepSpeed ZeRO++: 4x less network communication for dramatically more efficient training of large models and ChatGPT-like models
+
+
+
+
+
+
+Figure 1: An overview of DeepSpeed ZeRO++
+
+
+Large AI models are transforming the digital world. Generative language models built on large language models (LLMs), such as Turing-NLG, ChatGPT, and GPT-4, are remarkably versatile and can perform tasks like summarization, code generation, and translation. Similarly, large multimodal generative models such as DALL·E, Microsoft Designer, and Bing Image Creator can produce art, architecture, video, and other digital assets, empowering content creators, architects, and engineers to explore entirely new frontiers of creative productivity.
+
+However, training these large models requires substantial memory and computing resources across hundreds or even thousands of GPU devices. For example, training the [Megatron-Turing NLG 530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) model used more than 4,000 NVIDIA A100 GPUs. Using these resources efficiently requires a sophisticated optimization system that partitions the model sensibly across the memory of the individual devices and parallelizes the computation on them effectively. At the same time, for the deep learning community to easily train large models, these optimizations must be easy to use.
+
+The ZeRO [family of optimizations](https://www.deepspeed.ai/tutorials/zero/) in DeepSpeed offers a powerful solution to these challenges and has been widely used to train large deep learning models such as TNLG-17B, Bloom-176B, MPT-7B, and Jurassic-1. Despite its transformative capabilities, in some key scenarios ZeRO incurs heavy data transfer overhead between GPUs, which lowers training efficiency. This happens in particular when a) the global batch size is small relative to a large number of GPUs, so the per-GPU batch size is small and communication must be frequent; or b) training runs on low-end clusters where limited cross-node network bandwidth causes high communication latency. In these cases, ZeRO's training efficiency is limited.
+
+To address these limitations, we are releasing [ZeRO++](https://arxiv.org/abs/2306.10209). ZeRO++ reduces total communication volume by 4x compared to ZeRO without affecting model quality. This has two key implications:
+
+1. *ZeRO++ accelerates large-model pretraining and fine-tuning*
+    1. Small per-GPU batch sizes: Whether pretraining large models on thousands of GPUs or fine-tuning them on hundreds or even tens of GPUs, ZeRO++ delivers up to 2.2x higher throughput than ZeRO when the per-GPU batch size is small, directly reducing training time and cost.
+    2. Low-bandwidth clusters: ZeRO++ enables low-bandwidth clusters to achieve throughput similar to high-end clusters with 4x the bandwidth, making efficient large-model training possible across a much wider range of clusters.
+
+2. *ZeRO++ accelerates RLHF training of ChatGPT-like models*
+
+    1. Although ZeRO++ was designed primarily for training, its optimizations automatically apply to [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html#:~:text=ZeRO-Inference%20adapts%20and%20optimizes%20ZeRO-Infinity%20techniques%20for%20model,memory%2C%20thus%20hosting%20no%20%28zero%29%20weights%20in%20GPU.) as well, since the communication overhead applies equally to ZeRO's training and inference. ZeRO++ can therefore improve the efficiency of algorithms such as reinforcement learning from human feedback (RLHF), which combines training and inference.
+
+    2. Through its integration with DeepSpeed-Chat, ZeRO++ can speed up the generation phase of RLHF training by up to 2x and the reinforcement learning training phase by up to 1.3x compared to the original ZeRO.
+
+Next, we take a deeper look at ZeRO and its communication overhead, and discuss the key optimizations in ZeRO++ that address it. We then show the impact of ZeRO++ on training throughput across model sizes, batch sizes, and bandwidth constraints, and discuss how ZeRO++ applies to DeepSpeed-Chat to accelerate the training of dialogue models with RLHF.
+
+## ZeRO++ in Depth
+
+
+
+
+
+Figure 2: The ZeRO optimizer workflow
+
+
+ZeRO is a memory-efficient form of data parallelism in which the model states are partitioned across all GPUs rather than replicated, and are reconstructed during training as needed using gather/broadcast-based collectives. This lets ZeRO effectively harness the aggregate GPU memory and compute of all devices while keeping the simplicity and ease of use of data-parallel training.
+
+Let the model size be M. During the forward pass, ZeRO performs all-gather/broadcast operations to collect the parameters for each model layer just before they are needed (a total of size M). In the backward pass, ZeRO uses a similar communication pattern on each layer's parameters to compute their local gradients (a total of size M). In addition, ZeRO averages and partitions each local gradient immediately after it is computed, using reduce or reduce-scatter communication (a total of size M). ZeRO therefore communicates 3M in total, spread evenly across two all-gather/broadcast operations and one reduce-scatter/reduce operation.
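+
+To make the accounting concrete, here is a small back-of-the-envelope sketch (illustrative Python, not DeepSpeed code) that tallies the per-step communication volume of ZeRO-3 and, for comparison, the reduced volume that ZeRO++ targets, as explained in the sections below:
+
+```python
+def zero3_comm_volume(M):
+    """Per-step communication volume of ZeRO-3 for a model of size M."""
+    fwd_allgather = M          # gather weights layer by layer before the forward pass
+    bwd_allgather = M          # gather weights again for the backward pass
+    grad_reduce_scatter = M    # average and partition the local gradients
+    return fwd_allgather + bwd_allgather + grad_reduce_scatter   # = 3M
+
+def zeropp_comm_volume(M):
+    """The same accounting with the three ZeRO++ optimizations applied."""
+    fwd_allgather = 0.5 * M          # qwZ: fp16 -> int8 halves the all-gather volume
+    bwd_allgather = 0.0              # hpZ: the backward all-gather stays inside each node
+    grad_reduce_scatter = 0.25 * M   # qgZ: int4 gradients quarter the reduce-scatter
+    return fwd_allgather + bwd_allgather + grad_reduce_scatter   # = 0.75M
+
+M = 1.0  # normalized model size
+print(zero3_comm_volume(M) / zeropp_comm_volume(M))  # -> 4.0, the advertised 4x reduction
+```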
+
+To reduce these communication overheads, ZeRO++ introduces three sets of communication optimizations, one for each of the three collectives above:
+
+
+
+
+
+Figure 3: Partitioned (blockwise) quantization in qwZ
+
+
+
+### Quantized weight communication for ZeRO (qwZ)
+
+First, to reduce the parameter communication volume during all-gather, we quantize each model parameter on the fly from FP16 (two bytes) to the INT8 (one byte) data type before communication and dequantize the weights afterwards. Naively quantizing the weights, however, degrades training accuracy. To preserve good training accuracy, we adopt partitioned quantization, quantizing each subset of the model parameters independently. No high-performance implementation of partitioned quantization existed, so we implemented a set of highly optimized quantization CUDA kernels from scratch, achieving 3x higher accuracy and 5x higher speed than basic quantization.
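+
+The fused CUDA kernels are what ship in ZeRO++; as a minimal sketch of the underlying idea only, blockwise symmetric INT8 quantization can be expressed in a few lines of PyTorch (the block size and tensor shape here are illustrative assumptions):
+
+```python
+import torch
+
+def blockwise_quantize(w: torch.Tensor, block_size: int = 2048):
+    # One scale per block: quantization error is bounded by each block's own
+    # value range, not by the global max of the tensor.
+    # Assumes w.numel() % block_size == 0.
+    blocks = w.float().flatten().view(-1, block_size)
+    scales = blocks.abs().amax(dim=1, keepdim=True) / 127.0
+    q = torch.clamp((blocks / scales).round(), min=-127, max=127).to(torch.int8)
+    return q, scales
+
+def blockwise_dequantize(q: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
+    return (q.float() * scales).flatten()
+
+w = torch.randn(8 * 2048, dtype=torch.float16)   # stand-in for a parameter shard
+q, scales = blockwise_quantize(w)
+w_hat = blockwise_dequantize(q, scales)
+print((w.float() - w_hat).abs().max())           # small, block-bounded error
+```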
+
+
+
+
+
+Figure 4: Hierarchical partitioning of weights (hpZ)
+
+
+
+### Hierarchical weight partitioning for ZeRO (hpZ)
+
+Second, to reduce the communication overhead of the all-gather on weights during the backward pass, we trade GPU memory for communication. Specifically, instead of spreading the model weights across all machines as in ZeRO, we maintain a full copy of the model within each machine. At the cost of higher memory overhead, this lets us replace the expensive cross-machine all-gather/broadcast on weights with an intra-machine all-gather/broadcast, which is substantially faster thanks to the higher intra-machine communication bandwidth.
+
+
+
+
+
+Figure 5: The end-to-end qgZ workflow
+
+
+
+### Quantized gradient communication for ZeRO (qgZ)
+
+Third, reducing the cost of the reduce-scatter on gradients is even more challenging, because directly applying quantization to reduce communication volume is infeasible: even with partitioned quantization to lower the quantization error, gradient reduction accumulates and amplifies that error. To address this, we quantize gradients only before communication, but dequantize them back to full precision before any reduce operation. To do this efficiently, we invented a novel all-to-all-based quantized gradient communication paradigm called qgZ, which is functionally equivalent to a compressed reduce-scatter operation.
+
+qgZ is designed to solve two challenges: i) naively implementing reduce-scatter in INT4/INT8 would cause significant accuracy loss, and ii) using quantization in a conventional tree- or ring-based reduce-scatter requires a long chain of quantize-dequantize steps, which accumulates error and adds significant latency even if the reductions themselves run at full precision. To solve both challenges, qgZ replaces the tree- or ring-based reduce-scatter algorithm with a novel hierarchical all-to-all approach.
+
+qgZ has three main steps: i) gradient slice reordering, ii) intra-node communication and reduction, and iii) inter-node communication and reduction. First, before any communication happens, we slice the gradients and reorder the tensor slices so that the final gradient placement on each GPU at the end of communication (the green blocks in Figure 5) is correct. Second, we quantize the reordered gradient slices, perform an all-to-all within each node, dequantize the gradient slices received from the all-to-all, and do a local reduction. Third, we quantize the locally reduced gradients again, perform an inter-node all-to-all, dequantize the received gradients once more, and compute the final high-precision gradient reduction, producing the green blocks in Figure 5.
+
+The reason for this hierarchical approach is to reduce cross-node communication volume. More precisely, given N GPUs per node, a model size of M, and a quantization ratio of Z, a single-hop all-to-all would generate M\*N/Z of cross-node traffic. With the hierarchical approach, we instead reduce the cross-node traffic of each GPU from M/Z to M/(Z\*N), so the total drops from M\*N/Z to M\*N/(Z\*N) = M/Z. We further optimize qgZ's end-to-end latency by overlapping intra-node and inter-node communication and by fusing the CUDA kernels for (tensor slice reordering + intra-node quantization) and for (intra-node dequantization + intra-node reduction + inter-node quantization).
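+
+The traffic arithmetic in this paragraph is easy to check numerically; the sketch below (plain Python with illustrative values) compares a single-hop all-to-all against qgZ's two-hop hierarchical scheme:
+
+```python
+def cross_node_traffic(M, N, Z):
+    """M: model size, N: GPUs per node, Z: quantization compression ratio."""
+    single_hop = M * N / Z        # each of the N GPUs sends M/Z across nodes
+    hierarchical = M / Z          # after intra-node reduction, M/(Z*N) per GPU
+    return single_hop, hierarchical
+
+M, N, Z = 1.0, 8, 4               # normalized model size, 8 GPUs/node, int4 -> Z = 4
+single_hop, qgz = cross_node_traffic(M, N, Z)
+print(single_hop, qgz)            # 2.0 vs 0.25: qgZ cuts cross-node traffic by N (= 8x)
+```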
+
+
+
+| Communication Volume | Forward all-gather on weights | Backward all-gather on weights | Backward reduce-scatter on gradients | Total |
+|:---------------------------:|:------------------------------------:|:-------------------------------------:|:-------------------------------------------:|:------------:|
+| ZeRO | M | M | M | 3M |
+| ZeRO++ | 0.5M | 0 | 0.25M | 0.75M |
+
+
+
+### **Total communication volume optimization**
+
+By combining all three components above, we reduce the cross-node communication volume from 3M down to 0.75M. Specifically, we use qwZ to cut the forward all-gather/broadcast on model weights from M to 0.5M; we use hpZ to eliminate the cross-node all-gather during the backward pass, reducing that communication from M to 0; and we use qgZ to cut the cross-node reduce-scatter during the backward pass from M to 0.25M.
+
+## **ZeRO++ accelerates large language model training**
+
+Here we show test results for ZeRO++ in real LLM training scenarios on 384 NVIDIA V100 GPUs.
+
+
+
+
+
+Figure 6: Throughput of ZeRO++ vs. ZeRO at various model sizes on 384 V100 GPUs, with nodes interconnected by 4 InfiniBand (IB) links, each running at 100 Gbps.
+
+
+
+### **Higher training efficiency with small per-GPU batch sizes**
+
+**High-bandwidth clusters:** As shown in Figure 6, we first demonstrate ZeRO++'s throughput improvement over ZeRO across model sizes and micro-batch sizes, using 4x InfiniBand (IB) links for 400 Gbps of cross-node interconnect bandwidth, each link running at 100 Gbps. At a micro-batch size of 1k tokens per GPU, ZeRO++ delivers 28% to 36% higher throughput than ZeRO-3. At a 2k-token micro-batch size, ZeRO++ achieves a 24% to 29% throughput gain over ZeRO-3.
+
+
+
+
+
+
+Figure 7: Throughput of various LLMs on 384 V100 GPUs with 100 Gbps cross-node bandwidth
+
+
+
+**Low-bandwidth clusters:** In low-bandwidth network environments such as 100 Gbps, ZeRO++ performs significantly better than ZeRO-3. As shown in Figure 7, ZeRO++ achieves up to 2.2x speedup in end-to-end throughput compared to ZeRO-3. On average, ZeRO++ achieves roughly a 2x speedup over the ZeRO-3 baseline.
+
+
+
+
+
+
+Figure 8: ZeRO++ achieves high-bandwidth-cluster performance with significantly lower bandwidth
+
+
+
+### **Equivalent training efficiency between high-bandwidth ZeRO and low-bandwidth ZeRO++ clusters**
+
+Furthermore, ZeRO++ can achieve system throughput in a low-bandwidth cluster comparable to ZeRO in a much higher-bandwidth setting. As shown in Figure 8, for both the 18B and 138B model sizes, ZeRO++ with 200 Gbps of cross-node bandwidth reaches TFLOPs similar to ZeRO-3 with 800 Gbps of cross-node bandwidth.
+
+Given this excellent scalability, we view ZeRO++ as the next generation of ZeRO for training large AI models.
+
+## **DeepSpeed-Chat with ZeRO++ for RLHF training**
+
+### **RLHF training background**
+
+ChatGPT-like models are powered by LLMs and [fine-tuned with RLHF](https://openai.com/blog/chatgpt). RLHF consists of a generation (inference) phase and a training phase. In the generation phase, the actor model takes a partial conversation as input and generates a response using a sequence of forward passes. In the training phase, the critic model ranks the generated responses by quality, providing a reinforcement signal to the actor model. The actor model is fine-tuned with these rankings, enabling it to generate more accurate and appropriate responses in subsequent iterations.
+
+RLHF training creates substantial memory pressure because it uses four models (actor, reference, critic, reward). A common solution is low-rank adaptation (LoRA), which freezes the pretrained model's weights and injects trainable rank-decomposition matrices into each layer of the Transformer architecture, significantly reducing the number of trainable parameters. LoRA speeds up RLHF by reducing memory usage, allowing larger batch sizes and thus much higher throughput.
+
+### **DeepSpeed-Chat with ZeRO++ for RLHF training**
+
+
+
+
+
+
+Figure 9: ZeRO++ accelerates both the generation and training phases of RLHF training
+
+
+
+ZeRO++ has a unique application in the RLHF + LoRA scenario, because most of the model weights are frozen. This means ZeRO++ can hold these frozen weights quantized in INT4/8, rather than storing them in FP16 and quantizing them before each communication operation. Dequantization after communication is still performed to prepare the weights for computation, but the dequantized weights are simply discarded once the computation completes.
+
+Using ZeRO++ this way for RLHF training reduces both memory usage and communication volume, improving training throughput through less communication and through the larger batch sizes enabled by lower memory usage. In the generation phase, ZeRO++ uses hpZ to keep all weight communication within each node, exploiting the higher intra-node bandwidth and reducing communication volume to further boost generation throughput.
+
+ZeRO++ has been integrated into DeepSpeed-Chat to power RLHF training of ChatGPT-like models. In Figure 9, we compare RLHF generation throughput for actor models of different sizes, testing ZeRO and ZeRO++ on 32 V100 GPUs with 30B and 66B actor models. The results show that ZeRO++ achieves up to 2.25x higher RLHF generation throughput than ZeRO. We also show a speedup in the training phase on 16 V100 GPUs, where ZeRO++ achieves 1.26x higher throughput than ZeRO, thanks to the lower communication volume and larger batch sizes enabled by ZeRO++.
+
+## **DeepSpeed ZeRO++ is now available!**
+
+We are very excited to release DeepSpeed ZeRO++ and make it available to everyone in the AI community. Please visit our GitHub page for the [LLM training tutorial](https://www.deepspeed.ai/tutorials/zeropp/). ZeRO++ for DeepSpeed-Chat will be released in the coming weeks.
+
+For more technical details about ZeRO++, please see our [arXiv paper](https://arxiv.org/pdf/2306.10209.pdf).
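+
+As a quick orientation (the tutorial above is authoritative; the flag names below follow it but should be double-checked there), the three optimizations are enabled together under ZeRO stage 3 in the DeepSpeed config, roughly like this:
+
+```python
+ds_config = {
+    "zero_optimization": {
+        "stage": 3,
+        "zero_quantized_weights": True,    # qwZ: int8 weight all-gather
+        "zero_hpz_partition_size": 16,     # hpZ: secondary partition = GPUs per node
+        "zero_quantized_gradients": True,  # qgZ: quantized gradient reduce-scatter
+    },
+    "bf16": {"enabled": True},
+}
+# model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
+#                                                      config=ds_config, ...)
+```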
+
+DeepSpeed-ZeRO++ is part of the DeepSpeed ecosystem. To learn more, please visit our website, where you will find detailed blog posts, tutorials, and helpful documentation.
+
+You can also get the latest DeepSpeed news on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) accounts.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the DeepSpeed GitHub page. See our contributing guide for more details. We are open to collaborations with universities, research labs, and companies. For such requests (and others not suited for GitHub), please reach out to us directly by email.
+
+**Contributors:**
+
+This project was made possible by contributions from the following members of the DeepSpeed team:
+
+[Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), Heyang Qin, Sam Ade Jacobs, Connor Holmes, [Samyam Rajbhandari](https://www.microsoft.com/en-us/research/people/samyamr/), [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/), Ammar Ahmad Awan, Jeff Rasley, Michael Wyatt, [Yuxiong He](https://www.microsoft.com/en-us/research/people/yuxhe/) (team lead)
diff --git a/blogs/zeropp/japanese/README.md b/blogs/zeropp/japanese/README.md
new file mode 100644
index 000000000000..a4d4e68f6b02
--- /dev/null
+++ b/blogs/zeropp/japanese/README.md
@@ -0,0 +1,186 @@
+
+
+# DeepSpeed ZeRO++: Dramatically faster training for LLMs and chat models – a 4x reduction in communication overhead –
+
+
+
+
+
+
+Figure 1: An overview of DeepSpeed ZeRO++
+
+
+Large AI models are transforming the digital world as we speak. Generative language models powered by large language models (LLMs), such as Turing-NLG, ChatGPT, and GPT-4, are remarkably versatile and can perform tasks like summarization, coding, and translation. Similarly, large multimodal generative models such as DALL·E, Microsoft Designer, and Bing Image Creator can generate art, architecture, video, and other digital assets, empowering content creators, architects, and engineers to unlock creative productivity and open new frontiers.
+
+However, training these large models requires vast memory and computing resources across hundreds or thousands of GPU devices. For example, more than 4,000 NVIDIA A100 GPUs were used to train the [Megatron-Turing NLG 530B model](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/). Using these resources efficiently requires a complex optimization system that partitions the model to fit in the memory of the individual GPU devices and parallelizes computation efficiently across them. At the same time, to make large-model training easy for users, these optimizations must be simple to apply.
+
+The family of optimization techniques called [ZeRO](https://www.deepspeed.ai/tutorials/zero/) provided by DeepSpeed offers a powerful solution to these challenges and has been widely used to train large, powerful deep learning models such as TNLG-17B, Bloom-176B, MPT-7B, and Jurassic-1. Despite its strengths, in some usage scenarios the data transfer overhead between GPUs grows large, making high training efficiency hard to achieve. This happens in particular when a) training on many GPUs relative to the (global) batch size, so the per-GPU batch size is small and frequent communication is required, or b) training on low-end compute clusters, where inter-node network bandwidth is limited and communication latency is high. In these scenarios, ZeRO's benefits of ease of use and computational efficiency cannot be fully realized.
+
+[ZeRO++](https://arxiv.org/abs/2306.10209), which we are releasing today, is a system that solves these problems by optimizing ZeRO's communication, delivering extremely high efficiency for large-model training regardless of batch-size limits or inter-device bandwidth constraints. By combining quantization with communication and data remapping, ZeRO++ reduces total communication volume by 4x compared to ZeRO without affecting model quality. This has the two key effects described below.
+
+
+1. *Faster pretraining and fine-tuning of large models*
+    1. Small per-GPU batch sizes: Whether pretraining a large model on thousands of GPUs or fine-tuning one on hundreds or tens of GPUs, ZeRO++ provides up to 2.2x the throughput of ZeRO when the per-GPU batch size is small, reducing training time and cost.
+
+    2. Low-bandwidth clusters: With ZeRO++, a cluster with limited bandwidth can achieve throughput equivalent to a cluster with 4x the bandwidth, enabling efficient large-model training across a wide variety of clusters.
+
+2. *Faster training of ChatGPT-like models with RLHF*
+
+    1. Although ZeRO++ was designed primarily to accelerate training, communication overhead is a challenge shared by both training and inference with ZeRO, so its optimizations are also effective for [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html#:~:text=ZeRO-Inference%20adapts%20and%20optimizes%20ZeRO-Infinity%20techniques%20for%20model,memory,%20thus%20hosting%20no%20(zero)%20weights%20in%20GPU.), the mechanism used for inference. As a result, ZeRO++ improves the efficiency of workloads that combine training and inference, such as reinforcement learning from human feedback (RLHF) used for dialogue models.
+
+    2. Through its integration with DeepSpeed-Chat, ZeRO++ can accelerate the generation phase of RLHF training by up to 2x and the training phase by up to 1.3x compared to the original ZeRO.
+
+Next, we dig deeper into ZeRO and its communication overhead, and explain the key optimizations in ZeRO++. We also demonstrate the impact of ZeRO++ on training speed across model sizes, batch sizes, and bandwidth constraints, and discuss how ZeRO++ is applied to DeepSpeed-Chat to accelerate training dialogue models with RLHF.
+
+## ZeRO++ in detail
+
+
+
+
+
+Figure 2: Optimizations in ZeRO
+
+
+ZeRO is a memory-efficient improvement on data parallelism: instead of replicating the model states on every GPU, it partitions them across GPUs and reconstructs the partitioned model states during training with gather/broadcast collectives, executed on demand. This lets ZeRO aggregate the memory and compute of all GPU devices and use them effectively, while preserving the simplicity and ease of use of data parallelism.
+
+In the forward pass, ZeRO collects the parameters of each model layer just before use via allgather/broadcast communication (let the total parameter size be M). In the backward pass, ZeRO uses the same communication pattern on each layer's parameters to compute gradients locally on each GPU (the total gradient size is likewise M). In addition, ZeRO averages and partitions the locally computed gradients using reduce or reduce-scatter communication (total size M). In total, with two allgather/broadcast operations and one reduce or reduce-scatter, the communicated data adds up to 3M.
+
+To reduce these communication overheads, ZeRO++ implements a set of optimization techniques targeting the three communications above:
+
+
+
+
+
+Figure 3: Block-based quantization in qwZ
+
+
+
+### Quantized parameter communication (qwZ)
+
+First, to reduce the volume of parameter communication during allgather, we quantize the parameters: immediately before communication, each model parameter is converted from the FP16 (two-byte) to the INT8 (one-byte) data type, and converted back after communication. Naive parameter quantization, however, can degrade training accuracy. To preserve accuracy, we adopt block-based quantization, which quantizes each subset of the model parameters independently. Since no high-performance implementation of block-based quantization existed, we implemented highly optimized quantization CUDA kernels from scratch for ZeRO++, achieving 3x better accuracy and 5x higher speed compared to basic quantization.
+
+
+
+
+
+Figure 4: Hierarchical parameter partitioning in hpZ
+
+
+
+### Hierarchical parameter partitioning for ZeRO (hpZ)
+
+Second, in the backward pass we reduce the communication overhead of the parameter allgather in exchange for higher GPU memory usage. Specifically, instead of distributing the entire model's parameters across the GPU devices of all servers as in ZeRO, we keep a complete copy of the model within each server. This increases the required memory, but it replaces the allgather/broadcast across servers, where communication bandwidth is typically limited, with an allgather/broadcast that uses only high-bandwidth intra-server communication, yielding a substantial speedup.
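+
+As a rough accounting sketch (plain Python; simplified assumptions: weights only, ZeRO-3 semantics), this is the memory-for-traffic trade that hpZ makes:
+
+```python
+def hpz_tradeoff(M, gpus_per_node, world_size):
+    """Per-GPU weight-shard memory and cross-node backward all-gather traffic."""
+    zero3_shard = M / world_size        # weights sharded across every GPU
+    hpz_shard = M / gpus_per_node       # a full copy sharded within each node
+    zero3_cross_node = M                # backward all-gather crosses node boundaries
+    hpz_cross_node = 0.0                # backward all-gather is intra-node only
+    return zero3_shard, hpz_shard, zero3_cross_node, hpz_cross_node
+
+# e.g., a normalized model on 8 nodes of 8 GPUs: 8x more weight memory per GPU,
+# in exchange for removing the backward all-gather from the cross-node network.
+print(hpz_tradeoff(M=1.0, gpus_per_node=8, world_size=64))
+```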
+
+
+
+
+
+Figure 5: The qgZ workflow
+
+
+
+### Quantized gradient communication (qgZ)
+
+Reducing the cost of gradient communication via reduce-scatter, which we take up next, is even harder than the challenges above. Naively applying quantization to shrink the communication volume accumulates errors during the additions performed by the reduce, even with block-based quantization. We therefore quantize gradients before sending them and, after receiving them, dequantize them before the reduce's additions. To do this efficiently, we devised a new all-to-all-based quantized gradient communication paradigm called qgZ.
+
+qgZ is designed to solve two problems: i) overcoming the significant accuracy loss that would result from computing the reduce at low precision if we simply implemented reduce-scatter in INT4/INT8, and ii) avoiding the accuracy loss and long latency overhead caused by the long series of quantize/dequantize steps in a conventional ring- or tree-based reduce-scatter, even when the reduce-scatter itself runs at full precision. Instead of a ring- or tree-based reduce-scatter algorithm, qgZ uses a novel hierarchical all-to-all communication approach.
+
+qgZ has three main steps: i) reordering of gradient slices, ii) intra-node communication and reduction, and iii) inter-node communication and reduction. First, before any communication occurs, we slice the gradient tensors and reorder the slices so that the correct gradient placement (the green gradient slices in Figure 5) is obtained on each GPU when communication completes. Second, the reordered gradient slices are quantized and an all-to-all is performed within each node; the gradient slices received from the all-to-all are dequantized and reduced (summed) locally. Third, the locally reduced gradients are quantized again and exchanged across nodes with another all-to-all; the received gradients are dequantized once more and reduced at the original precision, yielding the green gradient slices of Figure 5.
+
+The reason for this hierarchical approach is to reduce inter-node communication volume. More precisely, given N GPUs per node, a model size of M, and a quantization ratio of Z, a single-hop all-to-all generates M*N/Z of inter-node traffic. With the hierarchical approach, the inter-node traffic of each GPU drops from M/Z to M/(Z*N), so the total communication volume falls from M*N/Z to M*N/(Z*N) = M/Z. We further optimize qgZ's end-to-end latency by overlapping intra-node and inter-node communication and by fusing the CUDA kernels for (tensor slice reordering + intra-node quantization) and (intra-node dequantization + intra-node reduction + inter-node quantization).
+
+
+
+| Communication Volume | Forward all-gather on weights | Backward all-gather on weights | Backward reduce-scatter on gradients | Total |
+|:---------------------------:|:------------------------------------:|:-------------------------------------:|:-------------------------------------------:|:------------:|
+| ZeRO | M | M | M | 3M |
+| ZeRO++ | 0.5M | 0 | 0.25M | 0.75M |
+
+
+
+### **Communication volume reduction**
+
+By incorporating all three optimization techniques above, we reduce the inter-node communication volume from 3M to 0.75M. Specifically, qwZ reduces the forward allgather/broadcast on model parameters from M to 0.5M, hpZ eliminates the cross-node allgather in the backward pass, reducing it from M to 0, and qgZ reduces the inter-node reduce-scatter in the backward pass from M to 0.25M.
+
+## **Accelerating LLM training with ZeRO++**
+
+Here we present evaluation results for ZeRO++ in real LLM training scenarios using 384 NVIDIA V100 GPUs.
+
+
+
+
+
+Figure 6: Throughput of ZeRO++ vs. ZeRO at various model sizes (384 V100 GPUs, 400 Gbps (4x 100 Gbps) inter-node connectivity)
+
+
+
+### **High efficiency even with small per-GPU batch sizes**
+
+**High-bandwidth clusters:** Figure 6 shows that ZeRO++ outperforms ZeRO in throughput across model sizes and micro-batch sizes, using 400 Gbps inter-node connections built from four InfiniBand (IB) links, each running at 100 Gbps. With 1k tokens per GPU, ZeRO++ achieves a 28% to 36% throughput improvement over ZeRO-3. With a micro-batch size of 2k, ZeRO++ achieves a 24% to 29% throughput improvement over ZeRO-3.
+
+
+
+
+
+
+Figure 7: Throughput comparison of differently sized LLMs (384 GPUs, 100 Gbps inter-node connectivity)
+
+
+**Low-bandwidth clusters:** In slower network environments such as 100 Gbps networks, ZeRO++ performs significantly better. As shown in Figure 7, ZeRO++ achieves up to 2.2x speedup in end-to-end throughput over ZeRO-3. On average, ZeRO++ achieves roughly a 2x speedup over the ZeRO-3 baseline.
+
+
+
+
+
+
+Figure 8: With ZeRO++, a low-bandwidth cluster achieves performance equivalent to running ZeRO on a high-bandwidth cluster
+
+
+
+### **Matching the efficiency of high-bandwidth clusters on low-bandwidth clusters**
+
+Moreover, ZeRO++ can achieve system throughput on a low-bandwidth cluster comparable to ZeRO on a cluster with far higher bandwidth. As shown in Figure 8, for both the 18B and 138B models, ZeRO++ in an environment with 200 Gbps inter-node connectivity achieves TFLOPs equivalent to ZeRO-3 with 800 Gbps inter-node connectivity. Given this excellent scalability, we position ZeRO++ as the next generation of ZeRO for training large AI models.
+
+## **Applying ZeRO++ to RLHF training with DeepSpeed-Chat**
+
+### **Background on RLHF training**
+
+Models like ChatGPT are built by training an LLM and [fine-tuning it with RLHF](https://openai.com/blog/chatgpt). RLHF consists of a generation (inference) phase and a training phase. In the generation phase, the actor model takes a partial conversation as input and generates a response using a series of forward passes. In the training phase, the critic model ranks the generated responses by quality and provides a reinforcement signal to the actor model. The actor model is fine-tuned using these rankings so that it generates more accurate and appropriate responses in subsequent iterations.
+
+RLHF training requires a very large amount of memory because it uses four models (actor, reference, critic, reward). To address this, low-rank adaptation (LoRA) is employed. LoRA freezes the pretrained model's parameters and adds trainable rank-decomposition matrices to each layer of the Transformer architecture, greatly reducing the number of trainable parameters. By cutting memory usage with LoRA, RLHF is accelerated, larger batch sizes become possible, and throughput improves substantially.
+
+### **Applying ZeRO++ to DeepSpeed-Chat for RLHF training**
+
+
+
+
+
+
+Figure 9: ZeRO++ accelerates both the generation and training phases of RLHF training
+
+
+
+When LoRA is used in RLHF, most model parameters are frozen. ZeRO++ offers a special capability that exploits this: while ZeRO++ normally keeps frozen parameters in FP16 and quantizes them before each communication operation, in RLHF they can instead be quantized to INT4/8 ahead of time. Dequantization after communication is still required, but the dequantized parameters are discarded once the computation that uses them finishes.
+
+Using ZeRO++ for RLHF training in this way reduces both memory usage and communication volume. Because memory usage is reduced along with communication, larger batch sizes become possible and training throughput improves. In the generation phase, ZeRO++ uses hpZ to keep all parameter communication within each node, reducing communication volume while exploiting the high intra-node bandwidth, which further improves generation throughput.
+
+ZeRO++ is integrated into DeepSpeed-Chat and provides strong support for RLHF training of ChatGPT-like models. In Figure 9, we compare the RLHF generation throughput of ZeRO and ZeRO++ on 32 V100 GPUs for 30B and 66B actor models. The results confirm that ZeRO++ improves RLHF generation throughput by up to 2.25x over ZeRO. In the training phase on 16 V100 GPUs, ZeRO++ achieves 1.26x better throughput than ZeRO, thanks to the lower communication volume and larger batch sizes that ZeRO++ enables.
+
+## **Try it out now!**
+
+We are delighted to release DeepSpeed ZeRO++ and make it available to everyone in the AI community. To get started, see the [tutorial](https://www.deepspeed.ai/tutorials/zeropp/) on LLM training. ZeRO++ for DeepSpeed-Chat is scheduled for release in the coming weeks.
+
+For the technical details of ZeRO++, see our [paper](https://arxiv.org/pdf/2306.10209.pdf) on arXiv.
+
+DeepSpeed-ZeRO++ is part of the DeepSpeed ecosystem. For more information, visit our [website](https://www.deepspeed.ai/), where you will find detailed blog posts, tutorials, and documentation.
+
+We also share the latest DeepSpeed news on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu account](https://www.zhihu.com/people/deepspeed).
+
+DeepSpeed welcomes your participation in its development. You can report bugs, submit pull requests, and join discussions on the DeepSpeed GitHub page; see the [guidelines](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for details. We also collaborate with universities, research institutes, and companies. For such collaboration requests (and other topics not suited to GitHub), please contact us directly by email.
+
+
+**Contributors:**
+
+This project was made possible by the following members of the DeepSpeed team:
+
+[Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), Heyang Qin, Sam Ade Jacobs, Connor Holmes, [Samyam Rajbhandari](https://www.microsoft.com/en-us/research/people/samyamr/), [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/), Ammar Ahmad Awan, Jeff Rasley, Michael Wyatt, [Yuxiong He](https://www.microsoft.com/en-us/research/people/yuxhe/) (team lead)
diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp
index 5ae2419cb5e1..563255176500 100644
--- a/csrc/adagrad/cpu_adagrad.cpp
+++ b/csrc/adagrad/cpu_adagrad.cpp
@@ -1,3 +1,8 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
#include "cpu_adagrad.h"
#include
#include
@@ -42,6 +47,8 @@ void Adagrad_Optimizer::Step_1(float* _params,
size_t offset = copy_size + t;
#if defined(__ENABLE_CUDA__)
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
+#elif defined(__ENABLE_CANN__)
+ if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); }
#endif
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
@@ -57,7 +64,7 @@ void Adagrad_Optimizer::Step_1(float* _params,
grad += _eps;
grad = momentum / grad;
param = grad * step_size + param;
-#if defined(__ENABLE_CUDA__)
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
#endif
if (half_precision)
@@ -74,6 +81,17 @@ void Adagrad_Optimizer::Step_1(float* _params,
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
+#elif defined(__ENABLE_CANN__)
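+ // CANN (Huawei Ascend) path: aclrtMemcpy performs a synchronous host-to-device
+ // copy of the updated fp32 tile from the staging buffer, so unlike the CUDA
+ // branch above there is no asynchronous launch on a stream here.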
+ if (dev_params) {
+ size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]);
+ aclrtMemcpy(dev_params + t,
+ memcpy_size,
+ _doubled_buffer[_buf_index],
+ memcpy_size,
+ aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE);
+
+ _buf_index = !_buf_index;
+ }
#endif
}
}
@@ -173,9 +191,9 @@ int ds_adagrad_step(int optimizer_id,
 std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
- opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
+ opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel());
-#if defined(__ENABLE_CUDA__)
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
opt->SynchronizeStreams();
#endif
return 0;
@@ -191,7 +209,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
-#if defined(__ENABLE_CUDA__)
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
@@ -209,7 +227,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id,
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
- params_c.size(0),
+ params_c.numel(),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp
index f17f22535ab8..96809827f3e1 100644
--- a/csrc/adam/cpu_adam.cpp
+++ b/csrc/adam/cpu_adam.cpp
@@ -1,297 +1,9 @@
-#include "cpu_adam.h"
-#include
-#include
-#include
-#include
-#include
-#include
-
-#if defined(__ENABLE_CUDA__)
-#include
-#include "cublas_v2.h"
-#include "cuda.h"
-#include "curand.h"
-#include "custom_cuda_layers.h"
-#endif
-
-static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
-
-// C++ interface
-
-void Adam_Optimizer::Step_1(float* _params,
- float* grads,
- float* _exp_avg,
- float* _exp_avg_sq,
- size_t _param_size,
- ds_half_precision_t* dev_params,
- bool half_precision)
-{
- size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
- Step_AVX<1>(&rounded_size,
- _params,
- grads,
- _exp_avg,
- _exp_avg_sq,
- _param_size,
- dev_params,
- half_precision);
-#endif
- if (_param_size > rounded_size) {
- float betta1_minus1 = 1 - _betta1;
- float betta2_minus1 = 1 - _betta2;
-
- float step_size = -1 * _alpha / _bias_correction1;
- float w_decay = -1 * _alpha * _weight_decay;
- ds_half_precision_t* grads_cast_h;
- ds_half_precision_t* params_cast_h;
- if (half_precision) {
- grads_cast_h = reinterpret_cast<ds_half_precision_t*>(grads);
- params_cast_h = reinterpret_cast<ds_half_precision_t*>(_params);
- }
-
- for (size_t t = rounded_size; t < _param_size; t += TILE) {
- size_t copy_size = TILE;
- if ((t + TILE) > _param_size) copy_size = _param_size - t;
- size_t offset = copy_size + t;
-#if defined(__ENABLE_CUDA__)
- if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
-#endif
-#pragma omp parallel for
- for (size_t k = t; k < offset; k++) {
- float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
- float param = half_precision ? (float)params_cast_h[k] : _params[k];
- float momentum = _exp_avg[k];
- float variance = _exp_avg_sq[k];
- if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
- momentum = momentum * _betta1;
- momentum = grad * betta1_minus1 + momentum;
-
- variance = variance * _betta2;
- grad = grad * grad;
- variance = grad * betta2_minus1 + variance;
-
- grad = sqrt(variance);
- grad = grad * _bias_correction2 + _eps;
- grad = momentum / grad;
- if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
- param = grad * step_size + param;
-#if defined(__ENABLE_CUDA__)
- if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
-#endif
- if (half_precision)
- params_cast_h[k] = (ds_half_precision_t)param;
- else
- _params[k] = param;
- _exp_avg[k] = momentum;
- _exp_avg_sq[k] = variance;
- }
-#if defined(__ENABLE_CUDA__)
- if (dev_params) {
- launch_param_update(
- _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
-
- _buf_index = !_buf_index;
- }
-#endif
- }
- }
-}
-
-void Adam_Optimizer::Step_4(float* _params,
- float* grads,
- float* _exp_avg,
- float* _exp_avg_sq,
- size_t _param_size,
- ds_half_precision_t* dev_params,
- bool half_precision)
-{
- size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
- Step_AVX<4>(&rounded_size,
- _params,
- grads,
- _exp_avg,
- _exp_avg_sq,
- _param_size,
- dev_params,
- half_precision);
-#endif
- if (_param_size > rounded_size)
- Step_1((_params + rounded_size),
- (grads + rounded_size),
- (_exp_avg + rounded_size),
- (_exp_avg_sq + rounded_size),
- (_param_size - rounded_size),
- (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
- half_precision);
-}
-
-int create_adam_optimizer(int optimizer_id,
- float alpha = 1e-3,
- float betta1 = 0.9,
- float betta2 = 0.999,
- float eps = 1e-8,
- float weight_decay = 0,
- bool adamw_mode = true,
- bool should_log = false)
-{
- auto opt =
- std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
-
- s_optimizers[optimizer_id] = opt;
-
- if (should_log) {
- std::string avx_type = "";
-#if defined(__AVX512__)
- avx_type = "AVX512";
-#else
-#if defined(__AVX256__)
- avx_type = "AVX2";
-#else
- avx_type = "scalar";
-#endif
-#endif
-
- printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
- optimizer_id,
- avx_type.c_str());
- printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
- alpha,
- betta1,
- betta2,
- weight_decay,
- (int)adamw_mode);
- }
-
- return 0;
-}
-
-void Adam_Optimizer::Step_8(float* _params,
- float* grads,
- float* _exp_avg,
- float* _exp_avg_sq,
- size_t _param_size,
- ds_half_precision_t* dev_params,
- bool half_precision)
-{
- size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
- Step_AVX<8>(&rounded_size,
- _params,
- grads,
- _exp_avg,
- _exp_avg_sq,
- _param_size,
- dev_params,
- half_precision);
-#endif
- if (_param_size > rounded_size)
- Step_4((_params + rounded_size),
- (grads + rounded_size),
- (_exp_avg + rounded_size),
- (_exp_avg_sq + rounded_size),
- (_param_size - rounded_size),
- (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
- half_precision);
-}
-
-int ds_adam_step(int optimizer_id,
- size_t step,
- float lr,
- float beta1,
- float beta2,
- float epsilon,
- float weight_decay,
- bool bias_correction,
- torch::Tensor& params,
- torch::Tensor& grads,
- torch::Tensor& exp_avg,
- torch::Tensor& exp_avg_sq)
-{
- auto params_c = params.contiguous();
- auto grads_c = grads.contiguous();
- auto exp_avg_c = exp_avg.contiguous();
- auto exp_avg_sq_c = exp_avg_sq.contiguous();
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
- // assert(params.options().dtype() == grads.options().dtype());
-
- float* params_ptr = (float*)params_c.data_ptr();
- float* grads_ptr = (float*)grads_c.data_ptr();
- float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
- float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
- std::shared_ptr<Adam_Optimizer> opt =
- std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
- opt->IncrementStep(step, beta1, beta2);
- opt->update_state(lr, epsilon, weight_decay, bias_correction);
-
- opt->Step_8(params_ptr,
- grads_ptr,
- exp_avg_ptr,
- exp_avg_sq_ptr,
- params_c.size(0),
- nullptr,
- (params.options().dtype() == at::kHalf));
-
-#if defined(__ENABLE_CUDA__)
- opt->SynchronizeStreams();
-#endif
- return 0;
-}
+// DeepSpeed Team
-int ds_adam_step_plus_copy(int optimizer_id,
- size_t step,
- float lr,
- float beta1,
- float beta2,
- float epsilon,
- float weight_decay,
- bool bias_correction,
- torch::Tensor& params,
- torch::Tensor& grads,
- torch::Tensor& exp_avg,
- torch::Tensor& exp_avg_sq,
- torch::Tensor& gpu_params)
-{
-#if defined(__ENABLE_CUDA__)
- auto params_c = params.contiguous();
- auto gpu_params_c = gpu_params.contiguous();
- auto exp_avg_c = exp_avg.contiguous();
- auto exp_avg_sq_c = exp_avg_sq.contiguous();
- auto grads_c = grads.contiguous();
-
- float* params_ptr = (float*)params_c.data_ptr();
- float* grads_ptr = (float*)grads_c.data_ptr();
- ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr();
- float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
- float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
- std::shared_ptr<Adam_Optimizer> opt =
- std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
- opt->IncrementStep(step, beta1, beta2);
- opt->update_state(lr, epsilon, weight_decay, bias_correction);
- opt->Step_8(params_ptr,
- grads_ptr,
- exp_avg_ptr,
- exp_avg_sq_ptr,
- params_c.size(0),
- gpu_params_ptr,
- (params.options().dtype() == at::kHalf));
-
- opt->SynchronizeStreams();
-#else
- assert(false);
-#endif
- return 0;
-}
-
-int destroy_adam_optimizer(int optimizer_id)
-{
- s_optimizers.erase(optimizer_id);
-
- return 0;
-}
+#include "cpu_adam.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
diff --git a/csrc/adam/cpu_adam_impl.cpp b/csrc/adam/cpu_adam_impl.cpp
new file mode 100644
index 000000000000..9a4a8d956519
--- /dev/null
+++ b/csrc/adam/cpu_adam_impl.cpp
@@ -0,0 +1,312 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cpu_adam.h"
+
+#if defined(__ENABLE_CUDA__)
+#include
+#include "cublas_v2.h"
+#include "cuda.h"
+#include "curand.h"
+#include "custom_cuda_layers.h"
+#endif
+
+static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
+
+// C++ interface
+
+void Adam_Optimizer::Step_1(float* _params,
+ float* grads,
+ float* _exp_avg,
+ float* _exp_avg_sq,
+ size_t _param_size,
+ ds_half_precision_t* dev_params,
+ bool half_precision)
+{
+ size_t rounded_size = 0;
+#if defined(__AVX512__) or defined(__AVX256__)
+ Step_AVX<1>(&rounded_size,
+ _params,
+ grads,
+ _exp_avg,
+ _exp_avg_sq,
+ _param_size,
+ dev_params,
+ half_precision);
+#endif
+ if (_param_size > rounded_size) {
+ float betta1_minus1 = 1 - _betta1;
+ float betta2_minus1 = 1 - _betta2;
+
+ float step_size = -1 * _alpha / _bias_correction1;
+ float w_decay = -1 * _alpha * _weight_decay;
+ ds_half_precision_t* grads_cast_h;
+ ds_half_precision_t* params_cast_h;
+ if (half_precision) {
+ grads_cast_h = reinterpret_cast<ds_half_precision_t*>(grads);
+ params_cast_h = reinterpret_cast<ds_half_precision_t*>(_params);
+ }
+
+ for (size_t t = rounded_size; t < _param_size; t += TILE) {
+ size_t copy_size = TILE;
+ if ((t + TILE) > _param_size) copy_size = _param_size - t;
+ size_t offset = copy_size + t;
+#if defined(__ENABLE_CUDA__)
+ if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
+#elif defined(__ENABLE_CANN__)
+ if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); }
+#endif
+#pragma omp parallel for
+ for (size_t k = t; k < offset; k++) {
+ float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
+ float param = half_precision ? (float)params_cast_h[k] : _params[k];
+ float momentum = _exp_avg[k];
+ float variance = _exp_avg_sq[k];
+ if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
+ momentum = momentum * _betta1;
+ momentum = grad * betta1_minus1 + momentum;
+
+ variance = variance * _betta2;
+ grad = grad * grad;
+ variance = grad * betta2_minus1 + variance;
+
+ grad = sqrt(variance);
+ grad = grad * _bias_correction2 + _eps;
+ grad = momentum / grad;
+ if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
+ param = grad * step_size + param;
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
+ if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
+#endif
+ if (half_precision)
+ params_cast_h[k] = (ds_half_precision_t)param;
+ else
+ _params[k] = param;
+ _exp_avg[k] = momentum;
+ _exp_avg_sq[k] = variance;
+ }
+#if defined(__ENABLE_CUDA__)
+ if (dev_params) {
+ launch_param_update(
+ _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
+
+ _buf_index = !_buf_index;
+ }
+#elif defined(__ENABLE_CANN__)
+ if (dev_params) {
+ size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]);
+ aclrtMemcpy(dev_params + t,
+ memcpy_size,
+ _doubled_buffer[_buf_index],
+ memcpy_size,
+ aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE);
+
+ _buf_index = !_buf_index;
+ }
+#endif
+ }
+ }
+}
+
+void Adam_Optimizer::Step_4(float* _params,
+ float* grads,
+ float* _exp_avg,
+ float* _exp_avg_sq,
+ size_t _param_size,
+ ds_half_precision_t* dev_params,
+ bool half_precision)
+{
+ size_t rounded_size = 0;
+#if defined(__AVX512__) or defined(__AVX256__)
+ Step_AVX<4>(&rounded_size,
+ _params,
+ grads,
+ _exp_avg,
+ _exp_avg_sq,
+ _param_size,
+ dev_params,
+ half_precision);
+#endif
+ if (_param_size > rounded_size)
+ Step_1((_params + rounded_size),
+ (grads + rounded_size),
+ (_exp_avg + rounded_size),
+ (_exp_avg_sq + rounded_size),
+ (_param_size - rounded_size),
+ (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
+ half_precision);
+}
+
+int create_adam_optimizer(int optimizer_id,
+ float alpha,
+ float betta1,
+ float betta2,
+ float eps,
+ float weight_decay,
+ bool adamw_mode,
+ bool should_log)
+{
+ auto opt =
+ std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
+
+ s_optimizers[optimizer_id] = opt;
+
+ if (should_log) {
+ std::string avx_type = "";
+#if defined(__AVX512__)
+ avx_type = "AVX512";
+#else
+#if defined(__AVX256__)
+ avx_type = "AVX2";
+#else
+ avx_type = "scalar";
+#endif
+#endif
+
+ printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
+ optimizer_id,
+ avx_type.c_str());
+ printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
+ alpha,
+ betta1,
+ betta2,
+ weight_decay,
+ (int)adamw_mode);
+ }
+
+ return 0;
+}
+
+void Adam_Optimizer::Step_8(float* _params,
+ float* grads,
+ float* _exp_avg,
+ float* _exp_avg_sq,
+ size_t _param_size,
+ ds_half_precision_t* dev_params,
+ bool half_precision)
+{
+ size_t rounded_size = 0;
+#if defined(__AVX512__) or defined(__AVX256__)
+ Step_AVX<8>(&rounded_size,
+ _params,
+ grads,
+ _exp_avg,
+ _exp_avg_sq,
+ _param_size,
+ dev_params,
+ half_precision);
+#endif
+ if (_param_size > rounded_size)
+ Step_4((_params + rounded_size),
+ (grads + rounded_size),
+ (_exp_avg + rounded_size),
+ (_exp_avg_sq + rounded_size),
+ (_param_size - rounded_size),
+ (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
+ half_precision);
+}
+
+int ds_adam_step(int optimizer_id,
+ size_t step,
+ float lr,
+ float beta1,
+ float beta2,
+ float epsilon,
+ float weight_decay,
+ bool bias_correction,
+ torch::Tensor& params,
+ torch::Tensor& grads,
+ torch::Tensor& exp_avg,
+ torch::Tensor& exp_avg_sq)
+{
+ auto params_c = params.contiguous();
+ auto grads_c = grads.contiguous();
+ auto exp_avg_c = exp_avg.contiguous();
+ auto exp_avg_sq_c = exp_avg_sq.contiguous();
+
+ // assert(params.options().dtype() == grads.options().dtype());
+
+ float* params_ptr = (float*)params_c.data_ptr();
+ float* grads_ptr = (float*)grads_c.data_ptr();
+ float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
+ float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
+
+ std::shared_ptr<Adam_Optimizer> opt =
+ std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
+ opt->IncrementStep(step, beta1, beta2);
+ opt->update_state(lr, epsilon, weight_decay, bias_correction);
+
+ opt->Step_8(params_ptr,
+ grads_ptr,
+ exp_avg_ptr,
+ exp_avg_sq_ptr,
+ params_c.numel(),
+ nullptr,
+ (params.options().dtype() == at::kHalf));
+
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
+ opt->SynchronizeStreams();
+#endif
+ return 0;
+}
+
+int ds_adam_step_plus_copy(int optimizer_id,
+ size_t step,
+ float lr,
+ float beta1,
+ float beta2,
+ float epsilon,
+ float weight_decay,
+ bool bias_correction,
+ torch::Tensor& params,
+ torch::Tensor& grads,
+ torch::Tensor& exp_avg,
+ torch::Tensor& exp_avg_sq,
+ torch::Tensor& device_params)
+{
+#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__)
+ auto params_c = params.contiguous();
+ auto device_params_c = device_params.contiguous();
+ auto exp_avg_c = exp_avg.contiguous();
+ auto exp_avg_sq_c = exp_avg_sq.contiguous();
+ auto grads_c = grads.contiguous();
+
+ float* params_ptr = (float*)params_c.data_ptr();
+ float* grads_ptr = (float*)grads_c.data_ptr();
+ ds_half_precision_t* device_params_ptr = (ds_half_precision_t*)device_params_c.data_ptr();
+ float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
+ float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
+
+ std::shared_ptr<Adam_Optimizer> opt =
+ std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
+ opt->IncrementStep(step, beta1, beta2);
+ opt->update_state(lr, epsilon, weight_decay, bias_correction);
+ opt->Step_8(params_ptr,
+ grads_ptr,
+ exp_avg_ptr,
+ exp_avg_sq_ptr,
+ params_c.numel(),
+ device_params_ptr,
+ (params.options().dtype() == at::kHalf));
+
+ opt->SynchronizeStreams();
+#else
+ assert(false);
+#endif
+ return 0;
+}
+
+int destroy_adam_optimizer(int optimizer_id)
+{
+ s_optimizers.erase(optimizer_id);
+
+ return 0;
+}
diff --git a/csrc/adam/fused_adam_frontend.cpp b/csrc/adam/fused_adam_frontend.cpp
index b06531c53002..13b390248608 100644
--- a/csrc/adam/fused_adam_frontend.cpp
+++ b/csrc/adam/fused_adam_frontend.cpp
@@ -1,3 +1,8 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
#include
void multi_tensor_adam_cuda(int chunk_size,
diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu
index 3cb9763befce..1b697d989b1a 100644
--- a/csrc/adam/multi_tensor_adam.cu
+++ b/csrc/adam/multi_tensor_adam.cu
@@ -1,6 +1,11 @@
-/* Copyright 2020 The Microsoft DeepSpeed Team
- Copyright NVIDIA/apex
- This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include
diff --git a/csrc/adam/multi_tensor_apply.cuh b/csrc/adam/multi_tensor_apply.cuh
index 13af4b7578f6..12f41cb49c6b 100644
--- a/csrc/adam/multi_tensor_apply.cuh
+++ b/csrc/adam/multi_tensor_apply.cuh
@@ -1,6 +1,11 @@
-/* Copyright 2020 The Microsoft DeepSpeed Team
- Copyright NVIDIA/apex
- This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include
diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp
index 9e405d8e704c..32b0e8a32394 100644
--- a/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/csrc/aio/common/deepspeed_aio_common.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
@@ -113,10 +115,13 @@ static int _do_io_complete(const long long int min_completes,
std::vector>& reap_times)
{
const auto start_time = std::chrono::high_resolution_clock::now();
- const auto n_completes = io_getevents(
- aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr);
+ long long int n_completes = io_pgetevents(aio_ctxt->_io_ctxt,
+ min_completes,
+ max_completes,
+ aio_ctxt->_io_events.data(),
+ nullptr,
+ nullptr);
reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time);
-
assert(n_completes >= min_completes);
return n_completes;
}
@@ -262,7 +267,7 @@ void report_file_error(const char* filename, const std::string file_op, const in
int open_file(const char* filename, const bool read_op)
{
- const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT);
+ const int flags = read_op ? (O_RDONLY | O_DIRECT) : (O_WRONLY | O_CREAT | O_DIRECT);
const int mode = 0600;
const auto fd = open(filename, flags, mode);
if (fd == -1) {
diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h
index cc62d33765c8..2940de945ee8 100644
--- a/csrc/aio/common/deepspeed_aio_common.h
+++ b/csrc/aio/common/deepspeed_aio_common.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_types.cpp b/csrc/aio/common/deepspeed_aio_types.cpp
index e5811bb91149..5e34a61065d4 100644
--- a/csrc/aio/common/deepspeed_aio_types.cpp
+++ b/csrc/aio/common/deepspeed_aio_types.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_types.h b/csrc/aio/common/deepspeed_aio_types.h
index be3b352d6be2..ce6a4e5cdfa7 100644
--- a/csrc/aio/common/deepspeed_aio_types.h
+++ b/csrc/aio/common/deepspeed_aio_types.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp
index e8bf9de11259..763b2c253a34 100644
--- a/csrc/aio/common/deepspeed_aio_utils.cpp
+++ b/csrc/aio/common/deepspeed_aio_utils.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h
index 6c5952749dd3..9c58c2286610 100644
--- a/csrc/aio/common/deepspeed_aio_utils.h
+++ b/csrc/aio/common/deepspeed_aio_utils.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
index a2670fb7b4cb..e9c6a8505858 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
@@ -22,7 +24,8 @@ io_op_desc_t::io_op_desc_t(const bool read_op,
_num_bytes(num_bytes),
_validate(validate)
{
- _cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer;
+ _cpu_buffer = (_buffer.is_cuda() || _buffer.is_xpu()) ? _buffer.to(torch::kCPU).pin_memory()
+ : _buffer;
_contiguous_buffer = _cpu_buffer.contiguous();
}
@@ -31,6 +34,7 @@ char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr
void io_op_desc_t::fini()
{
if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
+ if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); }
}
deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h
index d1cfcab8bfc2..20799ecbb018 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.h
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.h
@@ -1,7 +1,9 @@
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
index 20bdf5b142a6..752823dc7dd2 100644
--- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
+++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
@@ -1,7 +1,9 @@
-/*
-Copyright 2023 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for managing CPU tensors occupying page-locked memory.
*/
diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.h b/csrc/aio/py_lib/deepspeed_pin_tensor.h
index a421bbc8f3b1..4350a4ac7df6 100644
--- a/csrc/aio/py_lib/deepspeed_pin_tensor.h
+++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h
@@ -1,11 +1,14 @@
-/*
-Copyright 2023 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+/*
Functionality for managing CPU tensors occupying page-locked memory.
TODO: Implement a full-featured manager that
- 1. Avoid page-locked memory leaks
- 2. Minimize page-locked memory usage by reducing internal fragmentation
+1. Avoid page-locked memory leaks
+2. Minimize page-locked memory usage by reducing internal fragmentation
*/
#include