forked from EleutherAI/gpt-neox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dockerfile
151 lines (131 loc) · 5.71 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Copyright (c) 2021, EleutherAI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http:https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvidia/cuda:11.1.1-devel-ubuntu20.04
ENV DEBIAN_FRONTEND=noninteractive
#### System package (uses default Python 3 version in Ubuntu 20.04)
RUN apt-get update -y && \
apt-get install -y \
git python3 python3-dev libpython3-dev python3-pip sudo pdsh \
htop llvm-9-dev tmux zstd software-properties-common build-essential autotools-dev \
nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \
rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \
rdmacm-utils perftest rdma-core nano && \
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
pip install --upgrade pip && \
pip install gpustat
RUN apt-get install -y --no-install-recommends \
gcc \
build-essential \
patch \
file \
git \
curl \
swig \
nkf \
libmecab-dev \
locales \
wget \
vim \
emacs \
unzip \
mlocate \
time \
openssh-server
RUN apt-get install --reinstall -y libnss3
RUN apt-get install -y language-pack-ja-base language-pack-ja
RUN locale-gen ja_JP.UTF-8
RUN export echo "LANG=ja_JP.UTF-8" >> ~/.bashrc
RUN export echo "LC_ALL=ja_JP.UTF-8" >> ~/.bashrc
RUN export echo "PYTHONIOENCODING=utf-8" >> ~/.bashrc
### SSH
# Set password
ENV PASSWORD=password
RUN mkdir /var/run/sshd && \
echo "root:${PASSWORD}" | chpasswd && \
# Allow root login with password
sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
# Prevent user being kicked off after login
sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd && \
echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \
echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config && \
# FIX SUDO BUG: https://github.com/sudo-project/sudo/issues/42
echo "Set disable_coredump false" >> /etc/sudo.conf
# Expose SSH port
EXPOSE 22
#### OPENMPI
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0
RUN mkdir -p /build && \
cd /build && \
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
cd openmpi-${OPENMPI_VERSION} && \
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
make -j"$(nproc)" install && \
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
# Sanity check:
test -f /usr/local/mpi/bin/mpic++ && \
cd ~ && \
rm -rf /build
# Needs to be in docker PATH if compiling other items & bashrc PATH (later)
ENV PATH=/usr/local/mpi/bin:${PATH} \
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
chmod a+x /usr/local/mpi/bin/mpirun
#### User account
RUN useradd --create-home --uid 1000 --shell /bin/bash mchorse && \
usermod -aG sudo mchorse && \
echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
## SSH config and bashrc
RUN mkdir -p /home/mchorse/.ssh /job && \
echo 'Host *' > /home/mchorse/.ssh/config && \
echo ' StrictHostKeyChecking no' >> /home/mchorse/.ssh/config && \
echo 'export PDSH_RCMD_TYPE=ssh' >> /home/mchorse/.bashrc && \
echo 'export PATH=/home/mchorse/.local/bin:$PATH' >> /home/mchorse/.bashrc && \
echo 'export PATH=/usr/local/mpi/bin:$PATH' >> /home/mchorse/.bashrc && \
echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc
#### Python packages
RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge
COPY requirements/requirements.txt .
COPY requirements/requirements-onebitadam.txt .
COPY requirements/requirements-sparseattention.txt .
RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \
pip install -r requirements-sparseattention.txt && \
pip install protobuf==3.20.* && \
pip cache purge
## Install APEX
RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597
COPY megatron/ megatron
RUN python megatron/fused_kernels/setup.py install
# Clear staging
RUN mkdir -p /tmp && chmod 0777 /tmp
### WORKDIR
WORKDIR /data
WORKDIR /job
RUN echo 'Port 2222' >> /etc/ssh/sshd_config
RUN echo 'PasswordAuthentication no' >> /etc/ssh/sshd_config
EXPOSE 2222
WORKDIR /root/.ssh
COPY ssh/id_rsa /root/.ssh/id_rsa
RUN chmod 600 /root/.ssh/id_rsa
COPY ssh/authorized_keys /root/.ssh/authorized_keys
RUN chmod 644 /root/.ssh/authorized_keys
COPY ssh/hostfile /job/hostfile
COPY ssh/config /root/.ssh/config
WORKDIR /root
RUN git clone https://github.com/hisashi-ito/gpt-neox.git
RUN pip3 install best_download