start2

Jarvis-LLM · May 18, 2023 · 84cfbd4 · 84cfbd4
1 parent b0b561f
commit 84cfbd4
Show file tree

Hide file tree

Showing 19 changed files with 2,384 additions and 0 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,120 @@
+# Copyright (c) 2021, EleutherAI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM nvidia/cuda:11.1.1-devel-ubuntu20.04
+# Build-time only (ARG, not ENV): keeps apt non-interactive during the build without persisting into the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# OCI image metadata. LABEL needs key=value with no spaces around "=", otherwise the "=" is parsed as part of the value.
+LABEL org.opencontainers.image.version="2.0"
+LABEL org.opencontainers.image.authors="[email protected]"
+LABEL org.opencontainers.image.source="https://www.github.com/eleutherai/gpt-neox"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.1.1-devel-ubuntu20.04"
+
+#### System packages (default Python 3 of Ubuntu 20.04); apt lists and pip cache are dropped in this same layer so they never reach the image
+RUN apt-get update -y && \
+ apt-get install -y \
+ git python3 python3-dev libpython3-dev python3-pip sudo pdsh \
+ htop llvm-9-dev tmux zstd software-properties-common build-essential autotools-dev \
+ nfs-common cmake g++ gcc curl wget vim less unzip iftop iotop ca-certificates ssh \
+ rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \
+ rdmacm-utils perftest rdma-core nano && \
+ rm -rf /var/lib/apt/lists/* && \
+ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
+ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
+ pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir gpustat
+
+### SSH server setup (mkdir -p: the sshd runtime directory may already exist, which must not fail the build)
+# NOTE(review): root gets a well-known default password that also stays in the image ENV; change it (or move to key-only auth) before exposing the container.
+ENV PASSWORD=password
+RUN mkdir -p /var/run/sshd && \
+ echo "root:${PASSWORD}" | chpasswd && \
+ # Allow root login with password
+ sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+ # Make pam_loginuid optional so users are not kicked off right after login
+ sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd && \
+ echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \
+ echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config && \
+ # FIX SUDO BUG: https://github.com/sudo-project/sudo/issues/42
+ echo "Set disable_coredump false" >> /etc/sudo.conf
+
+# SSH port of the sshd configured above (EXPOSE documents the port; it does not publish it)
+EXPOSE 22/tcp
+
+#### OPENMPI: built from source into /usr/local/openmpi-<version> and symlinked to /usr/local/mpi
+ENV OPENMPI_BASEVERSION=4.1
+ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0
+RUN mkdir -p /build && cd /build && \
+ # Download to a file, then extract: with "wget | tar" under /bin/sh only tar's exit status is checked
+ wget -q -O openmpi.tar.gz https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz && \
+ tar xzf openmpi.tar.gz && \
+ cd openmpi-${OPENMPI_VERSION} && \
+ ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
+ make -j"$(nproc)" install && \
+ ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
+ # Sanity check: the compiler wrapper must be reachable through the symlink
+ test -f /usr/local/mpi/bin/mpic++ && \
+ cd ~ && rm -rf /build
+
+# Put OpenMPI on the image-wide PATH and library path (needed when compiling later items; the same paths go into .bashrc further down)
+ENV PATH=/usr/local/mpi/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+
+# Wrap mpirun so it may run as root by default; the wrapper execs the real binary by absolute path so it works even when /usr/local/mpi/bin is not on PATH
+RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+ echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+ echo 'exec /usr/local/mpi/bin/mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+ chmod a+x /usr/local/mpi/bin/mpirun
+
+#### Account mchorse: uid 1000, bash shell, home directory, member of sudo with passwordless sudo
+RUN useradd -m -u 1000 -s /bin/bash \
+ -G sudo mchorse && \
+ echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+
+## SSH client config and bashrc for mchorse; ~/.ssh is created by root here, so it is handed over to mchorse to let ssh write known_hosts and keys
+RUN mkdir -p /home/mchorse/.ssh /job && \
+ printf '%s\n' 'Host *' ' StrictHostKeyChecking no' > /home/mchorse/.ssh/config && \
+ chown -R mchorse: /home/mchorse/.ssh && \
+ echo 'export PDSH_RCMD_TYPE=ssh' >> /home/mchorse/.bashrc && \
+ echo 'export PATH=/home/mchorse/.local/bin:$PATH' >> /home/mchorse/.bashrc && \
+ echo 'export PATH=/usr/local/mpi/bin:$PATH' >> /home/mchorse/.bashrc && \
+ echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc
+
+#### Python packages: pinned torch first, then the requirement sets; the protobuf spec is quoted so the shell cannot glob-expand its trailing "*"
+RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge
+COPY requirements/requirements.txt .
+COPY requirements/requirements-wandb.txt .
+COPY requirements/requirements-onebitadam.txt .
+COPY requirements/requirements-sparseattention.txt .
+COPY requirements/requirements-flashattention.txt .
+RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \
+ pip install -r requirements-sparseattention.txt && \
+ pip install -r requirements-flashattention.txt && \
+ pip install -r requirements-wandb.txt && \
+ pip install 'protobuf==3.20.*' && \
+ pip cache purge
+
+## Native extensions: NVIDIA Apex from a pinned commit (C++ and CUDA extensions enabled), then the fused kernels from the copied megatron/ tree
+RUN pip install --verbose --no-cache-dir --disable-pip-version-check --global-option=--cpp_ext --global-option=--cuda_ext git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597
+
+COPY megatron/ megatron/
+RUN python megatron/fused_kernels/setup.py install
+
+# Ensure /tmp exists and is world-writable WITH the sticky bit (1777), so users cannot delete each other's files
+RUN mkdir -p /tmp && chmod 1777 /tmp
+
+#### Finish as the unprivileged mchorse user, starting in its home directory
+WORKDIR /home/mchorse
+USER mchorse
diff --git a/README-MUP.md b/README-MUP.md
@@ -0,0 +1,49 @@
+# How to use Mup (https://github.com/microsoft/mup)
+
+## Add mup neox args to your config
+
+```
+# mup
+
+"use-mup": true,
+
+"save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank
+
+"base-shapes-file": "base-shapes", # load base shapes from this file
+
+"coord-check": false, # generate coord check plots to verify mup's implementation in neox
+
+# mup hp search
+
+"mup-init-scale": 1.0,
+
+"mup-attn-temp": 1.0,
+
+"mup-output-temp": 1.0,
+
+"mup-embedding-mult": 1.0,
+
+"mup-rp-embedding-mult": 1.0,
+```
+
+## Generate base shapes
+
+1. Set use-mup to true
+2. Set save-base-shapes to true
+3. Run once. gpt-neox will instantiate a base model and a delta model, save one file per rank named `<base-shapes-file>.<rank>`, and then exit immediately.
+4. Set save-base-shapes to false
+
+## Generate coord check plots (optional)
+
+1. Keep use-mup true
+2. Set coord-check to true
+3. Run once. gpt-neox will output jpg images similar to those shown at https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check, and then exit immediately.
+4. Set coord-check to false
+
+## Tune mup hyperparameters and LR
+
+The hyperparameters listed under `mup hp search` correspond to Appendix F.4 of https://arxiv.org/pdf/2203.03466.pdf. Tune them, together with the learning rate (LR), by random search using the scaled-up config (tested with 6-7B.yml) but with hidden-size set to the value from the scaled-down config (125M.yml).
+
+## Transfer
+
+With the best LR set and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again.