-
Notifications
You must be signed in to change notification settings - Fork 981
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'rework-mup' of https://github.com/EleutherAI/gpt-neox into rework-mup
- Loading branch information
Showing
47 changed files
with
1,841 additions
and
216 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# CI workflow: on pull requests that touch megatron/, build the gpt-neox Docker
# container on a self-hosted runner and run the CPU-marked pytest suite in it.
name: "Pull Request CPU Tests"

on:
  pull_request:
    paths: # job only triggers when the PR changes files under megatron directory
      - "megatron/**"

jobs:
  run-tests:
    runs-on: [ 'test', 'self-hosted' ]
    env:
      # Name of the test container; every docker step below refers to it via $CONTAINER.
      CONTAINER: gpt-neox-cpu-tests
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v4

      - name: Install Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.8"
          cache: "pip"
          cache-dependency-path: "**/requirements*.txt"

      - name: Upgrade Pip
        run: python -m pip install --upgrade pip

      - name: Set up Docker repository # this should possibly be done by the worker before the job starts in the interest of execution time?
        run: |
          # Add Docker's official GPG key:
          sudo apt-get update -y
          sudo apt-get install ca-certificates curl -y
          sudo install -m 0755 -d /etc/apt/keyrings
          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
          sudo chmod a+r /etc/apt/keyrings/docker.asc
          # Add the repository to Apt sources:
          echo \
            "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
          sudo apt-get update

      - name: Docker installation # this should possibly be done by the worker before the job starts in the interest of execution time?
        run: |
          sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y
          sudo docker run hello-world

      - name: Prepare data
        run: |
          python prepare_data.py -d ./data

      - name: Remove previous container
        run: |
          # Match the container name exactly (-x) and literally (-F) so that a
          # substring hit elsewhere in the `docker ps` output cannot trigger removal.
          if docker ps -a --format '{{.Names}}' | grep -qxF "$CONTAINER"; then
            echo "Container already exists, deleting it..."
            docker rm -f "$CONTAINER"
          fi

      - name: Create container
        run: |
          export NEOX_DATA_PATH='./data/enwik8'
          export NEOX_CHECKPOINT_PATH='/mnt/sda/checkpoints' #todo: where do I get this?
          # `tail -f /dev/null` keeps the container alive so later steps can `docker exec` into it.
          docker compose run -d --build --name "$CONTAINER" gpt-neox tail -f /dev/null

      - name: Install test requirements
        run: |
          docker exec "$CONTAINER" pip install -r /workspace/requirements-dev.txt

      - name: Execute CPU tests 1
        run: |
          docker exec "$CONTAINER" sh -c "cd gpt-neox && pytest tests -m cpu"

      - name: Execute CPU tests 2
        run: |
          docker exec "$CONTAINER" sh -c "cd gpt-neox && PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python pytest tests -m cpu"

      - name: Generate report
        run: |
          # Start the report server detached (-d): http.server never exits, so a
          # foreground exec would keep this step, and the job, running forever.
          # NOTE(review): nothing visible in this workflow produces htmlcov/ (pytest
          # is not run with coverage options here) - confirm the directory exists.
          docker exec -d "$CONTAINER" python -m http.server --directory htmlcov 8000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Base image selection. Each FROM starts a new build stage, so with two FROM
# lines back to back the instructions that follow build on the second one
# (nvcr.io/nvidia/pytorch:24.02-py3).
FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 | ||
FROM nvcr.io/nvidia/pytorch:24.02-py3 | ||
|
||
# Keeps apt/dpkg from prompting for input during package installation.
# NOTE(review): set via ENV, so the value also persists into running containers.
ENV DEBIAN_FRONTEND=noninteractive | ||
|
||
|
@@ -21,16 +21,16 @@ LABEL org.opencontainers.image.version = "2.0" | |
# OCI image metadata. Pairs are written key="value" with no spaces around "=":
# with spaces, Docker falls back to the legacy `LABEL key value` form and the
# "=" becomes part of the stored value.
# NOTE(review): the authors address looks redacted in this copy - restore the
# real contact address before building.
LABEL org.opencontainers.image.authors="[email protected]"
LABEL org.opencontainers.image.source="https://www.github.com/eleutherai/gpt-neox"
LABEL org.opencontainers.image.licenses="Apache-2.0"
# When the same label key is set twice, the later value is the one kept.
LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:12.1.1-devel-ubuntu22.04"
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/pytorch:24.02-py3"
|
||
#### System package (uses default Python 3 version in Ubuntu 22.04) | ||
RUN apt-get update -y && \ | ||
apt-get install -y \ | ||
git python3-dev libpython3-dev python3-pip sudo pdsh \ | ||
htop tmux zstd software-properties-common build-essential autotools-dev \ | ||
nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \ | ||
rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \ | ||
rdmacm-utils perftest rdma-core nano && \ | ||
python3-pip sudo pdsh \ | ||
htop tmux zstd software-properties-common \ | ||
nfs-common pdsh cmake htop iftop iotop ssh \ | ||
iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils \ | ||
rdmacm-utils perftest rdma-core && \ | ||
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ | ||
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ | ||
python -m pip install --upgrade pip && \ | ||
|
@@ -48,21 +48,6 @@ RUN mkdir /var/run/sshd && \ | |
# Expose SSH port | ||
# EXPOSE only documents the port; it still has to be published at run time.
EXPOSE 22 | ||
|
||
#### OPENMPI | ||
# Builds Open MPI ${OPENMPI_VERSION} from the upstream release tarball inside a
# /build scratch directory, installs it under /usr/local/openmpi-${OPENMPI_VERSION},
# and symlinks that prefix to /usr/local/mpi. The build fails if mpic++ is
# missing afterwards, and /build is removed in the same RUN.
ENV OPENMPI_BASEVERSION=4.1 | ||
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0 | ||
RUN mkdir -p /build && \ | ||
cd /build && \ | ||
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ | ||
cd openmpi-${OPENMPI_VERSION} && \ | ||
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ | ||
make -j"$(nproc)" install && \ | ||
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ | ||
# Sanity check: | ||
test -f /usr/local/mpi/bin/mpic++ && \ | ||
cd ~ && \ | ||
rm -rf /build | ||
|
||
# Needs to be in docker PATH if compiling other items & bashrc PATH (later) | ||
# Prepends /usr/local/mpi/bin to PATH, and /usr/local/lib plus the
# /usr/local/mpi lib and lib64 directories to LD_LIBRARY_PATH.
ENV PATH=/usr/local/mpi/bin:${PATH} \ | ||
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} | ||
|
@@ -88,29 +73,12 @@ RUN mkdir -p /home/mchorse/.ssh /job && \ | |
echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc | ||
|
||
#### Python packages | ||
# Installs torch/torchvision/torchaudio from the cu121 wheel index, copies the
# files under requirements/ into the current directory, installs each
# requirements file in turn, and pins protobuf to the 3.20 series.
# NOTE(review): `pip cache purge` runs as its own RUN (its own layer), so it
# cannot shrink the layers written by the install steps above it.
RUN python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 | ||
COPY requirements/* ./ | ||
RUN python -m pip install --no-cache-dir -r requirements.txt && pip install -r requirements-onebitadam.txt | ||
RUN python -m pip install -r requirements-sparseattention.txt | ||
RUN python -m pip install -r requirements-flashattention.txt | ||
RUN python -m pip install -r requirements-wandb.txt | ||
RUN python -m pip install protobuf==3.20.* | ||
RUN python -m pip cache purge | ||
|
||
## Install APEX | ||
# Detect the architecture and install Apex accordingly | ||
# x86_64: download the prebuilt cp310 wheel archive from the
# segyges/not-nvidia-apex "jan-2024" release, unzip it and pip-install the wheel.
# Any other architecture: install requirements-apex-pip.txt, then build Apex
# from the pinned NVIDIA/apex commit with the --cpp_ext and --cuda_ext options.
RUN ARCH=$(uname -m) && \ | ||
if [ "$ARCH" = "x86_64" ]; then \ | ||
wget https://github.com/segyges/not-nvidia-apex/releases/download/jan-2024/apex-0.1-cp310-cp310-linux_x86_64.zip && \ | ||
unzip ./apex-0.1-cp310-cp310-linux_x86_64.zip && \ | ||
python -m pip install ./apex-0.1-cp310-cp310-linux_x86_64.whl; \ | ||
else \ | ||
# Install Apex directly from source for other architectures | ||
python -m pip install -r requirements-apex-pip.txt && \ | ||
python -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings --global-option=--cpp_ext --config-settings --global-option=--cuda_ext git+https://github.com/NVIDIA/apex.git@141bbf1cf362d4ca4d94f4284393e91dda5105a5; \ | ||
fi | ||
|
||
# Copies the fused kernel sources into the image and installs them by running
# setup.py from /megatron/fused_kernels. The first COPY has a relative
# destination (resolved against the current WORKDIR); the second targets the
# absolute path that WORKDIR then switches to.
COPY megatron/fused_kernels/ megatron/fused_kernels | ||
COPY megatron/fused_kernels/ /megatron/fused_kernels | ||
WORKDIR /megatron/fused_kernels | ||
RUN python setup.py install | ||
|
||
|
Oops, something went wrong.