Skip to content

Commit

Permalink
Merge branch 'rework-mup' of https://github.com/EleutherAI/gpt-neox into rework-mup
Browse files Browse the repository at this point in the history
  • Loading branch information
lintangsutawika committed May 2, 2024
2 parents 5ccf693 + 6fe55f4 commit 9dd583b
Show file tree
Hide file tree
Showing 47 changed files with 1,841 additions and 216 deletions.
69 changes: 69 additions & 0 deletions .github/workflows/cpu_ci_on_pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Pull-request CI: installs Docker on a self-hosted runner, builds and starts the
# gpt-neox container via `docker compose`, and runs the pytest suite filtered to
# the `cpu` marker inside that container.
#
# NOTE(review): $CONTAINER is not defined anywhere in this workflow; presumably it
# is provided by the self-hosted runner's environment — confirm, or add a
# job-level `env:` entry.
name: "Pull Request CPU Tests"

on:
  pull_request:
    paths: # job only triggers when the PR changes files under megatron directory
      - "megatron/**"

jobs:
  run-tests:
    runs-on: [ 'test', 'self-hosted' ]
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v4

      - name: Install Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.8"
          cache: "pip"
          cache-dependency-path: "**/requirements*.txt"

      - name: Upgrade Pip
        run: python -m pip install --upgrade pip

      - name: Set up Docker repository # this should possibly be done by the worker before the job starts in the interest of execution time?
        run: |
          # Add Docker's official GPG key:
          sudo apt-get update -y
          sudo apt-get install ca-certificates curl -y
          sudo install -m 0755 -d /etc/apt/keyrings
          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
          sudo chmod a+r /etc/apt/keyrings/docker.asc
          # Add the repository to Apt sources:
          echo \
            "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
          sudo apt-get update

      - name: Docker installation # this should possibly be done by the worker before the job starts in the interest of execution time?
        run: |
          sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y
          # Smoke test that the daemon can pull and run an image.
          sudo docker run hello-world

      - name: Prepare data
        run: |
          python prepare_data.py -d ./data

      - name: Remove previous container
        run: |
          # Fail with a clear message instead of calling `docker rm -f` with no argument.
          : "${CONTAINER:?CONTAINER must be set for this job}"
          # Exact, fixed-string match on the names column only; a plain grep over
          # `docker ps -a` would also match image names, commands, or substrings
          # of other container names.
          if docker ps -a --format '{{.Names}}' | grep -Fxq "$CONTAINER"; then
            echo "Container already exists, deleting it..."
            docker rm -f "$CONTAINER"
          fi

      - name: Create container
        run: |
          export NEOX_DATA_PATH='./data/enwik8'
          export NEOX_CHECKPOINT_PATH='/mnt/sda/checkpoints' #todo: where do I get this?
          # `tail -f /dev/null` keeps the detached container alive for the later `docker exec` steps.
          docker compose run -d --build --name "$CONTAINER" gpt-neox tail -f /dev/null

      - name: Install test requirements
        run: |
          docker exec "$CONTAINER" pip install -r /workspace/requirements-dev.txt

      - name: Execute CPU tests 1
        run: |
          docker exec "$CONTAINER" sh -c "cd gpt-neox && pytest tests -m cpu"

      - name: Execute CPU tests 2
        run: |
          # Same suite, re-run with the pure-Python protobuf implementation selected.
          docker exec "$CONTAINER" sh -c "cd gpt-neox && PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python pytest tests -m cpu"

      - name: Generate report
        run: |
          # Detached (-d): http.server never exits, so a foreground `docker exec`
          # would block this step (and the job) until the runner times out.
          docker exec -d "$CONTAINER" python -m http.server --directory htmlcov 8000
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@ repos:
- id: check-yaml
- id: destroyed-symlinks
- id: end-of-file-fixer
exclude: docs/CNAME
exclude: ^(docs/CNAME/|configs/neox_arguments.md)
- id: fix-byte-order-marker
- id: fix-encoding-pragma
args: [--remove]
- id: mixed-line-ending
args: [--fix=lf]
- id: requirements-txt-fixer
- id: trailing-whitespace
exclude: ^(docs/CNAME/|configs/neox_arguments.md)
- repo: https://gitlab.com/daverona/pre-commit/cpp
rev: 0.8.0
hooks:
Expand Down
48 changes: 8 additions & 40 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
FROM nvcr.io/nvidia/pytorch:24.02-py3

ENV DEBIAN_FRONTEND=noninteractive

Expand All @@ -21,16 +21,16 @@ LABEL org.opencontainers.image.version = "2.0"
LABEL org.opencontainers.image.authors = "[email protected]"
LABEL org.opencontainers.image.source = "https://www.github.com/eleutherai/gpt-neox"
LABEL org.opencontainers.image.licenses = " Apache-2.0"
LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:12.1.1-devel-ubuntu22.04"
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/pytorch:24.02-py3"

#### System package (uses default Python 3 version in Ubuntu 22.04)
RUN apt-get update -y && \
apt-get install -y \
git python3-dev libpython3-dev python3-pip sudo pdsh \
htop tmux zstd software-properties-common build-essential autotools-dev \
nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \
rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \
rdmacm-utils perftest rdma-core nano && \
python3-pip sudo pdsh \
htop tmux zstd software-properties-common \
nfs-common pdsh cmake htop iftop iotop ssh \
iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils \
rdmacm-utils perftest rdma-core && \
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
python -m pip install --upgrade pip && \
Expand All @@ -48,21 +48,6 @@ RUN mkdir /var/run/sshd && \
# Expose SSH port
EXPOSE 22

#### OPENMPI
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0
RUN mkdir -p /build && \
cd /build && \
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
cd openmpi-${OPENMPI_VERSION} && \
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
make -j"$(nproc)" install && \
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
# Sanity check:
test -f /usr/local/mpi/bin/mpic++ && \
cd ~ && \
rm -rf /build

# Needs to be in docker PATH if compiling other items & bashrc PATH (later)
ENV PATH=/usr/local/mpi/bin:${PATH} \
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
Expand All @@ -88,29 +73,12 @@ RUN mkdir -p /home/mchorse/.ssh /job && \
echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc

#### Python packages
RUN python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
COPY requirements/* ./
RUN python -m pip install --no-cache-dir -r requirements.txt && pip install -r requirements-onebitadam.txt
RUN python -m pip install -r requirements-sparseattention.txt
RUN python -m pip install -r requirements-flashattention.txt
RUN python -m pip install -r requirements-wandb.txt
RUN python -m pip install protobuf==3.20.*
RUN python -m pip cache purge

## Install APEX
# Detect the architecture and install Apex accordingly
RUN ARCH=$(uname -m) && \
if [ "$ARCH" = "x86_64" ]; then \
wget https://github.com/segyges/not-nvidia-apex/releases/download/jan-2024/apex-0.1-cp310-cp310-linux_x86_64.zip && \
unzip ./apex-0.1-cp310-cp310-linux_x86_64.zip && \
python -m pip install ./apex-0.1-cp310-cp310-linux_x86_64.whl; \
else \
# Install Apex directly from source for other architectures
python -m pip install -r requirements-apex-pip.txt && \
python -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings --global-option=--cpp_ext --config-settings --global-option=--cuda_ext git+https://github.com/NVIDIA/apex.git@141bbf1cf362d4ca4d94f4284393e91dda5105a5; \
fi

COPY megatron/fused_kernels/ megatron/fused_kernels
COPY megatron/fused_kernels/ /megatron/fused_kernels
WORKDIR /megatron/fused_kernels
RUN python setup.py install

Expand Down
Loading

0 comments on commit 9dd583b

Please sign in to comment.