forked from EleutherAI/gpt-neox
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
19 changed files
with
2,384 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
# Copyright (c) 2021, EleutherAI | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Base image: NVIDIA CUDA 11.1.1 "devel" image on Ubuntu 20.04.
FROM nvidia/cuda:11.1.1-devel-ubuntu20.04

# Keep apt/debconf from prompting during package installation. Because this is
# set with ENV, the value is also present in containers started from the image.
ENV DEBIAN_FRONTEND="noninteractive"
|
||
# Image metadata (org.opencontainers.image.* label keys).
# Every pair is written as key="value" with no whitespace around "=": with
# spaces, Docker falls back to the legacy "LABEL key value" form and the stored
# value begins with a literal "= ".
# NOTE(review): the authors value looks like an obfuscated e-mail placeholder;
# confirm the intended contact address.
LABEL org.opencontainers.image.version="2.0" \
      org.opencontainers.image.authors="[email protected]" \
      org.opencontainers.image.source="https://www.github.com/eleutherai/gpt-neox" \
      org.opencontainers.image.licenses="Apache-2.0" \
      org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.1.1-devel-ubuntu20.04"
|
||
#### System packages (uses the default Python 3 version in Ubuntu 20.04)
# - The package list is de-duplicated and sorted, one package per line, so that
#   changes to it are easy to review.
# - `python` and `pip` are registered as alternatives for python3 / pip3.
# - pip runs with --no-cache-dir and the apt package index is removed in the
#   same layer that downloaded it, so neither ends up in the image. Run
#   `apt-get update` again before installing further packages.
RUN apt-get update -y && \
    apt-get install -y \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++ \
        gcc \
        git \
        htop \
        ibutils \
        ibverbs-utils \
        iftop \
        infiniband-diags \
        iotop \
        iputils-ping \
        less \
        libcupti-dev \
        libmlx4-1 \
        libpython3-dev \
        llvm-9-dev \
        nano \
        net-tools \
        nfs-common \
        pdsh \
        perftest \
        python3 \
        python3-dev \
        python3-pip \
        rdma-core \
        rdmacm-utils \
        rsync \
        software-properties-common \
        ssh \
        sudo \
        tmux \
        unzip \
        vim \
        wget \
        zstd && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
    pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir gpustat && \
    rm -rf /var/lib/apt/lists/*
|
||
### SSH
# Root password used for SSH logins.
# NOTE(review): this is a fixed default stored in the image environment;
# change it before using the image anywhere untrusted.
ENV PASSWORD=password

# sshd setup, in order:
#   1. create the sshd runtime directory and set the root password
#   2. allow root to log in with a password
#   3. make pam_loginuid optional so users are not kicked off after login
#   4. append the authorized-keys location and enable password authentication
#   5. sudo workaround, see https://github.com/sudo-project/sudo/issues/42
RUN mkdir /var/run/sshd \
 && echo "root:${PASSWORD}" | chpasswd \
 && sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
 && sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd \
 && echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config \
 && echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config \
 && echo "Set disable_coredump false" >> /etc/sudo.conf

# SSH port
EXPOSE 22
|
||
#### OPENMPI
# Build OpenMPI from the release tarball, install it under a versioned prefix
# and expose it through the stable symlink /usr/local/mpi.
# The two ENV instructions stay separate because the second one expands the
# variable defined by the first.
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0

# Download + unpack, configure, build and install, then verify that the MPI
# C++ compiler wrapper exists before removing the build directory again.
RUN mkdir -p /build \
 && cd /build \
 && wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz \
    | tar xzf - \
 && cd openmpi-${OPENMPI_VERSION} \
 && ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} \
 && make -j"$(nproc)" install \
 && ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi \
 && test -f /usr/local/mpi/bin/mpic++ \
 && cd ~ \
 && rm -rf /build
|
||
# Put the OpenMPI binaries and libraries on the image-wide search paths so that
# later build steps can find them; the same paths are also written to the
# user's .bashrc further down.
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
|
||
# Replace mpirun with a two-line bash wrapper that always passes
# --allow-run-as-root and --prefix /usr/local/mpi to the real binary, which is
# kept next to it as mpirun.real.
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real \
 && printf '%s\n' \
      '#!/bin/bash' \
      'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' \
      > /usr/local/mpi/bin/mpirun \
 && chmod a+x /usr/local/mpi/bin/mpirun
|
||
#### User account
# Create the mchorse user (uid 1000, bash login shell, home directory created),
# add it to the sudo group and append a passwordless-sudo rule for it.
RUN useradd -m -u 1000 -s /bin/bash mchorse \
 && usermod --append --groups sudo mchorse \
 && echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
||
## SSH config and bashrc
# - ~/.ssh/config turns off strict host key checking for every host.
# - ~/.bashrc selects ssh as the pdsh remote command and extends PATH and
#   LD_LIBRARY_PATH (user-local bin directory, OpenMPI) for the user's shells.
# - /job is created in the same step and stays owned by root.
# This step runs as root, so ~/.ssh is handed over to mchorse at the end;
# otherwise the user could not write known_hosts or authorized_keys there.
RUN mkdir -p /home/mchorse/.ssh /job && \
    echo 'Host *' > /home/mchorse/.ssh/config && \
    echo ' StrictHostKeyChecking no' >> /home/mchorse/.ssh/config && \
    echo 'export PDSH_RCMD_TYPE=ssh' >> /home/mchorse/.bashrc && \
    echo 'export PATH=/home/mchorse/.local/bin:$PATH' >> /home/mchorse/.bashrc && \
    echo 'export PATH=/usr/local/mpi/bin:$PATH' >> /home/mchorse/.bashrc && \
    echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc && \
    chown -R mchorse: /home/mchorse/.ssh && \
    chmod 700 /home/mchorse/.ssh
|
||
#### Python packages
# PyTorch 1.8.1 (cu111 build) is installed first, from the PyTorch wheel index,
# and the pip cache is purged in the same layer.
RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html \
 && pip cache purge

# Copy the requirement files into the current directory of the build.
COPY requirements/requirements.txt \
     requirements/requirements-wandb.txt \
     requirements/requirements-onebitadam.txt \
     requirements/requirements-sparseattention.txt \
     requirements/requirements-flashattention.txt \
     ./

# Install the requirement sets one after another, then pin protobuf to the
# 3.20 series and purge the pip cache.
RUN pip install -r requirements.txt \
 && pip install -r requirements-onebitadam.txt \
 && pip install -r requirements-sparseattention.txt \
 && pip install -r requirements-flashattention.txt \
 && pip install -r requirements-wandb.txt \
 && pip install protobuf==3.20.* \
 && pip cache purge
|
||
## Install APEX
# Installed from a pinned commit of NVIDIA/apex, passing --cpp_ext and
# --cuda_ext to its setup script.
RUN pip install -v --disable-pip-version-check --no-cache-dir \
      --global-option="--cpp_ext" \
      --global-option="--cuda_ext" \
      git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597
|
||
# Copy the megatron sources into the image, then run the fused-kernels setup
# script with its `install` command.
COPY megatron/ megatron
RUN python megatron/fused_kernels/setup.py install
|
||
# Make sure /tmp exists and is world-writable with the sticky bit set (1777),
# so every user can create files there but only remove their own. A plain 0777
# would drop the sticky bit.
RUN mkdir -p /tmp && chmod 1777 /tmp
|
||
#### SWITCH TO mchorse USER
# From here on, build steps and containers started from the image run as
# mchorse, with that user's home directory as the working directory.
USER mchorse
WORKDIR /home/mchorse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# How to use Mup (https://github.com/microsoft/mup) | ||
|
||
## Add mup neox args to your config | ||
|
||
``` | ||
# mup | ||
"use-mup": true, | ||
"save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank | ||
"base-shapes-file": "base-shapes", # load base shapes from this file | ||
"coord-check": false, # generate coord check plots to verify mup's implementation in neox | ||
# mup hp search | ||
"mup-init-scale": 1.0, | ||
"mup-attn-temp": 1.0, | ||
"mup-output-temp": 1.0, | ||
"mup-embedding-mult": 1.0, | ||
"mup-rp-embedding-mult": 1.0, | ||
``` | ||
|
||
## Generate base shapes | ||
|
||
1. Set use-mup to true | ||
2. Set save-base-shapes to true | ||
3. Run once. gpt-neox will instantiate a base model and a delta model, then save one file per rank named `<base-shapes-file>.<rank>`. gpt-neox will exit immediately. | ||
4. Set save-base-shapes to false | ||
|
||
## Generate coord check plots (optional) | ||
|
||
1. Keep use-mup true | ||
2. Set coord-check to true | ||
3. Run once. gpt-neox will output jpg images similar to those shown at https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will exit immediately. | ||
4. Set coord-check to false | ||
|
||
## Tune mup hyperparameters and LR | ||
|
||
The options listed under `# mup hp search` in the config snippet above correspond to the hyperparameters in Appendix F.4 of https://arxiv.org/pdf/2203.03466.pdf. Tune them together with the learning rate (LR) using a random search: start from the scaled-up config (tested with 6-7B.yml), but set hidden-size to the value from the scaled-down config (125M.yml). | ||
|
||
## Transfer | ||
|
||
With the best LR set and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again. |
Oops, something went wrong.