From 1bdb8141517ebd6f207588c266326200ce2c7455 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Fri, 22 Jan 2021 17:50:49 -0700 Subject: [PATCH 01/16] Add initial Dockerfile and k8s deployment --- kubernetes/Dockerfile | 15 ++++++++++++ kubernetes/deploy_k8s.yml | 51 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 kubernetes/Dockerfile create mode 100644 kubernetes/deploy_k8s.yml diff --git a/kubernetes/Dockerfile b/kubernetes/Dockerfile new file mode 100644 index 000000000..1e564c129 --- /dev/null +++ b/kubernetes/Dockerfile @@ -0,0 +1,15 @@ +FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn + +RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ + apt-get update && \ + apt-get install -y git python3.8 python3.8-dev python3-pip sudo && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 + +RUN mkdir -p ~/.ssh && \ + echo 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChClBdh2UXpMhBV725nH1bMsGVaLmYAp8uMZFHtBqHN56sj/35jeenQTqr/Tov3u6xwqJK+rxegjgcPDZfuOSdsNnTMCLIacA/WqBMwW1mjdMc+zFOub7vQJHj4nmeF3pd4tSjt720ZLiX1ZsF5QrTIcnURAXT0/82SKIy2nqj18v9HCcIvBplexJU3SlVg+oWk/e5CsfnXvJMQH3VqJaeyrXIlaFgVOvVWSBY66Kc2H+g1RxLwe+BONyNanxSXxKHQwUXawBvXIyjekapB3HiNWLZZfZjNmqe2Ci+Y9PKn0CrkgXopIP7tVKn+UQ2fD3nSjaZyfRrmcFgXdotFKJh root@sshd-sid-79b6f9d7c6-mmw8x' > ~/.ssh/authorized_keys + +RUN python3 -m pip install --upgrade pip && \ + pip3 install torch pipx && \ + python3 -m pipx ensurepath diff --git a/kubernetes/deploy_k8s.yml b/kubernetes/deploy_k8s.yml new file mode 100644 index 000000000..3c6b184ab --- /dev/null +++ b/kubernetes/deploy_k8s.yml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: eleuther-neox +spec: + strategy: + type: Recreate + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: 
eleuther-neox + template: + metadata: + labels: + app.kubernetes.io/name: eleuther-neox + spec: + terminationGracePeriodSeconds: 10 + containers: + - name: neox + command: ["/usr/sbin/sshd"] + args: ["-D"] + tty: true + image: leogao2/deepspeed_eleuther + ports: + - name: sshd + containerPort: 2222 + protocol: TCP + + resources: + requests: + cpu: 30 + memory: 40Gi + limits: + nvidia.com/gpu: 8 + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # Edit for different GPU + - key: gpu.nvidia.com/model + operator: In + values: + - GeForce_RTX_2080_Ti + - key: failure-domain.beta.kubernetes.io/region + operator: In + values: + - ORD1 + + restartPolicy: Always \ No newline at end of file From 05f4dcf9a445d992c7ac64a9d5daeefaaf27700c Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Fri, 22 Jan 2021 22:23:48 -0700 Subject: [PATCH 02/16] Fix imports --- gpt_neox/gpt_neox.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gpt_neox/gpt_neox.py b/gpt_neox/gpt_neox.py index 1597612ba..cbe133e81 100644 --- a/gpt_neox/gpt_neox.py +++ b/gpt_neox/gpt_neox.py @@ -1,4 +1,5 @@ import torch +import torch.utils.checkpoint import torch.nn.functional as F from torch import nn, einsum from functools import partial From 6e9b3a89a8978da96fbb11e7972167fcde5beb52 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Fri, 22 Jan 2021 22:24:16 -0700 Subject: [PATCH 03/16] Update dockerfile --- kubernetes/Dockerfile => Dockerfile | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) rename kubernetes/Dockerfile => Dockerfile (71%) diff --git a/kubernetes/Dockerfile b/Dockerfile similarity index 71% rename from kubernetes/Dockerfile rename to Dockerfile index 1e564c129..5516b59e7 100644 --- a/kubernetes/Dockerfile +++ b/Dockerfile @@ -2,14 +2,28 @@ FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ apt-get update && \ - apt-get install 
-y git python3.8 python3.8-dev python3-pip sudo && \ + apt-get install -y git python3.8 python3.8-dev python3-pip sudo pdsh && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 -RUN mkdir -p ~/.ssh && \ - echo 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChClBdh2UXpMhBV725nH1bMsGVaLmYAp8uMZFHtBqHN56sj/35jeenQTqr/Tov3u6xwqJK+rxegjgcPDZfuOSdsNnTMCLIacA/WqBMwW1mjdMc+zFOub7vQJHj4nmeF3pd4tSjt720ZLiX1ZsF5QrTIcnURAXT0/82SKIy2nqj18v9HCcIvBplexJU3SlVg+oWk/e5CsfnXvJMQH3VqJaeyrXIlaFgVOvVWSBY66Kc2H+g1RxLwe+BONyNanxSXxKHQwUXawBvXIyjekapB3HiNWLZZfZjNmqe2Ci+Y9PKn0CrkgXopIP7tVKn+UQ2fD3nSjaZyfRrmcFgXdotFKJh root@sshd-sid-79b6f9d7c6-mmw8x' > ~/.ssh/authorized_keys - RUN python3 -m pip install --upgrade pip && \ pip3 install torch pipx && \ - python3 -m pipx ensurepath + python3 -m pipx ensurepath && \ + mkdir /app + +RUN mkdir -p ~/.ssh && \ + echo 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChClBdh2UXpMhBV725nH1bMsGVaLmYAp8uMZFHtBqHN56sj/35jeenQTqr/Tov3u6xwqJK+rxegjgcPDZfuOSdsNnTMCLIacA/WqBMwW1mjdMc+zFOub7vQJHj4nmeF3pd4tSjt720ZLiX1ZsF5QrTIcnURAXT0/82SKIy2nqj18v9HCcIvBplexJU3SlVg+oWk/e5CsfnXvJMQH3VqJaeyrXIlaFgVOvVWSBY66Kc2H+g1RxLwe+BONyNanxSXxKHQwUXawBvXIyjekapB3HiNWLZZfZjNmqe2Ci+Y9PKn0CrkgXopIP7tVKn+UQ2fD3nSjaZyfRrmcFgXdotFKJh root@sshd-sid-79b6f9d7c6-mmw8x' > ~/.ssh/authorized_keys && \ + echo -e 'Host *\n StrictHostKeyChecking no' > ~/.ssh/config && \ + chmod 600 ~/.ssh/config + + +WORKDIR /app + +COPY install_deepspeed.sh /app +RUN sh ./install_deepspeed.sh + +COPY requirements.txt /app +RUN pip install -r requirements.txt + +COPY . 
/app \ No newline at end of file From 206b8ca5d53f1b51f4e77b3b7852289573b27890 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Fri, 22 Jan 2021 22:31:25 -0700 Subject: [PATCH 04/16] Add /dev/shm patch --- kubernetes/deploy_k8s.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kubernetes/deploy_k8s.yml b/kubernetes/deploy_k8s.yml index 3c6b184ab..a54810347 100644 --- a/kubernetes/deploy_k8s.yml +++ b/kubernetes/deploy_k8s.yml @@ -25,6 +25,9 @@ spec: - name: sshd containerPort: 2222 protocol: TCP + volumeMounts: + - mountPath: /dev/shm + name: dshm resources: requests: @@ -47,5 +50,8 @@ spec: operator: In values: - ORD1 - + volumes: + - name: dshm + emptyDir: + medium: Memory restartPolicy: Always \ No newline at end of file From 2a578d2bd3ef96c8b445ff02dd4d15e5a559ecb3 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Fri, 22 Jan 2021 22:34:31 -0700 Subject: [PATCH 05/16] Update to be completely passwordless --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5516b59e7..8d3eafd77 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,12 @@ FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn -RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ +RUN echo -e 'AuthorizedKeysFile .ssh/authorized_keys\nPermitEmptyPasswords yes' >> /etc/ssh/sshd_config && \ apt-get update && \ apt-get install -y git python3.8 python3.8-dev python3-pip sudo pdsh && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 + update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ + passwd -d root RUN python3 -m pip install --upgrade pip && \ pip3 install torch pipx && \ From d48f48a5c2c2cdb9f5cbd0a54e23da8fd7a674f4 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Fri, 22 Jan 2021 22:46:09 -0700 
Subject: [PATCH 06/16] Add host file generation script --- kubernetes/make_hosts_file.sh | 1 + 1 file changed, 1 insertion(+) create mode 100755 kubernetes/make_hosts_file.sh diff --git a/kubernetes/make_hosts_file.sh b/kubernetes/make_hosts_file.sh new file mode 100755 index 000000000..32df7e685 --- /dev/null +++ b/kubernetes/make_hosts_file.sh @@ -0,0 +1 @@ +kubectl get pods -o wide | grep eleuther-neox | awk '{print $6 " slots=8"}' \ No newline at end of file From 032fde128c6f1a5e1def100b3c974ea3ffabdb3b Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Fri, 22 Jan 2021 23:08:44 -0700 Subject: [PATCH 07/16] Make deploy script --- kubernetes/deploy_k8s.sh | 11 +++++++++++ kubernetes/make_hosts_file.sh | 1 - 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100755 kubernetes/deploy_k8s.sh delete mode 100755 kubernetes/make_hosts_file.sh diff --git a/kubernetes/deploy_k8s.sh b/kubernetes/deploy_k8s.sh new file mode 100755 index 000000000..026679abf --- /dev/null +++ b/kubernetes/deploy_k8s.sh @@ -0,0 +1,11 @@ +kubectl delete deploy/eleuther-neox +kubectl apply -f deploy_k8s.yml +echo Waiting for deploy to complete... 
+kubectl wait --for=condition=available --timeout=600s deployment/eleuther-neox || exit + +kubectl get pods -o wide | grep eleuther-neox | awk '{print $6 " slots=8"}' > hosts +export MASTER_ID=$(kubectl get pods | grep eleuther-neox | awk '{print $1}' | head -n 1) +echo $MASTER_ID +kubectl cp $PWD/hosts $MASTER_ID:/app +#echo 'git remote set-url origin https://github.com/EleutherAI/gpt-neox/ && git pull' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash +kubectl exec --stdin --tty $MASTER_ID -- /bin/bash diff --git a/kubernetes/make_hosts_file.sh b/kubernetes/make_hosts_file.sh deleted file mode 100755 index 32df7e685..000000000 --- a/kubernetes/make_hosts_file.sh +++ /dev/null @@ -1 +0,0 @@ -kubectl get pods -o wide | grep eleuther-neox | awk '{print $6 " slots=8"}' \ No newline at end of file From 81f1ef40142cb09fb39a799b912bd46f0e8f8fdb Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Sat, 23 Jan 2021 01:03:18 -0700 Subject: [PATCH 08/16] Update deploy script and fix echo -e problem --- Dockerfile | 8 +++++--- kubernetes/deploy_k8s.sh => deploy_k8s.sh | 4 ++-- kubernetes/deploy_k8s.yml | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) rename kubernetes/deploy_k8s.sh => deploy_k8s.sh (83%) diff --git a/Dockerfile b/Dockerfile index 8d3eafd77..2d55e60c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn -RUN echo -e 'AuthorizedKeysFile .ssh/authorized_keys\nPermitEmptyPasswords yes' >> /etc/ssh/sshd_config && \ +RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ + echo 'PermitEmptyPasswords yes' >> /etc/ssh/sshd_config && \ apt-get update && \ apt-get install -y git python3.8 python3.8-dev python3-pip sudo pdsh && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ @@ -15,7 +16,8 @@ RUN python3 -m pip install --upgrade pip && \ RUN mkdir -p ~/.ssh && \ echo 'ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABAQChClBdh2UXpMhBV725nH1bMsGVaLmYAp8uMZFHtBqHN56sj/35jeenQTqr/Tov3u6xwqJK+rxegjgcPDZfuOSdsNnTMCLIacA/WqBMwW1mjdMc+zFOub7vQJHj4nmeF3pd4tSjt720ZLiX1ZsF5QrTIcnURAXT0/82SKIy2nqj18v9HCcIvBplexJU3SlVg+oWk/e5CsfnXvJMQH3VqJaeyrXIlaFgVOvVWSBY66Kc2H+g1RxLwe+BONyNanxSXxKHQwUXawBvXIyjekapB3HiNWLZZfZjNmqe2Ci+Y9PKn0CrkgXopIP7tVKn+UQ2fD3nSjaZyfRrmcFgXdotFKJh root@sshd-sid-79b6f9d7c6-mmw8x' > ~/.ssh/authorized_keys && \ - echo -e 'Host *\n StrictHostKeyChecking no' > ~/.ssh/config && \ + echo 'Host *' > ~/.ssh/config && \ + echo ' StrictHostKeyChecking no' >> ~/.ssh/config && \ chmod 600 ~/.ssh/config @@ -27,4 +29,4 @@ RUN sh ./install_deepspeed.sh COPY requirements.txt /app RUN pip install -r requirements.txt -COPY . /app \ No newline at end of file +COPY . /app diff --git a/kubernetes/deploy_k8s.sh b/deploy_k8s.sh similarity index 83% rename from kubernetes/deploy_k8s.sh rename to deploy_k8s.sh index 026679abf..7b1b8781e 100755 --- a/kubernetes/deploy_k8s.sh +++ b/deploy_k8s.sh @@ -1,5 +1,5 @@ kubectl delete deploy/eleuther-neox -kubectl apply -f deploy_k8s.yml +kubectl apply -f kubernetes/deploy_k8s.yml echo Waiting for deploy to complete... 
kubectl wait --for=condition=available --timeout=600s deployment/eleuther-neox || exit @@ -8,4 +8,4 @@ export MASTER_ID=$(kubectl get pods | grep eleuther-neox | awk '{print $1}' | he echo $MASTER_ID kubectl cp $PWD/hosts $MASTER_ID:/app #echo 'git remote set-url origin https://github.com/EleutherAI/gpt-neox/ && git pull' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash -kubectl exec --stdin --tty $MASTER_ID -- /bin/bash +echo "$@" | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash diff --git a/kubernetes/deploy_k8s.yml b/kubernetes/deploy_k8s.yml index a54810347..7fff2af59 100644 --- a/kubernetes/deploy_k8s.yml +++ b/kubernetes/deploy_k8s.yml @@ -5,7 +5,7 @@ metadata: spec: strategy: type: Recreate - replicas: 2 + replicas: 4 selector: matchLabels: app.kubernetes.io/name: eleuther-neox @@ -20,7 +20,7 @@ spec: command: ["/usr/sbin/sshd"] args: ["-D"] tty: true - image: leogao2/deepspeed_eleuther + image: leogao2/gpt-neox ports: - name: sshd containerPort: 2222 @@ -54,4 +54,4 @@ spec: - name: dshm emptyDir: medium: Memory - restartPolicy: Always \ No newline at end of file + restartPolicy: Always From ee1739e80a57a4bd9ba6e03b2673f1c28ad75b56 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Sat, 23 Jan 2021 01:08:30 -0700 Subject: [PATCH 09/16] Remove command line argument --- deploy_k8s.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy_k8s.sh b/deploy_k8s.sh index 7b1b8781e..f0c7f1020 100755 --- a/deploy_k8s.sh +++ b/deploy_k8s.sh @@ -8,4 +8,4 @@ export MASTER_ID=$(kubectl get pods | grep eleuther-neox | awk '{print $1}' | he echo $MASTER_ID kubectl cp $PWD/hosts $MASTER_ID:/app #echo 'git remote set-url origin https://github.com/EleutherAI/gpt-neox/ && git pull' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash -echo "$@" | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash +kubectl exec --stdin --tty $MASTER_ID -- /bin/bash From af90891a288761221a30912c6700e9e7702676a0 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Sat, 23 Jan 2021 
01:22:39 -0700 Subject: [PATCH 10/16] Generate keys for worker machines --- deploy_k8s.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/deploy_k8s.sh b/deploy_k8s.sh index f0c7f1020..1bcef4601 100755 --- a/deploy_k8s.sh +++ b/deploy_k8s.sh @@ -1,11 +1,23 @@ kubectl delete deploy/eleuther-neox kubectl apply -f kubernetes/deploy_k8s.yml echo Waiting for deploy to complete... +rm id_rsa* +ssh-keygen -t rsa -f id_rsa -N "" kubectl wait --for=condition=available --timeout=600s deployment/eleuther-neox || exit kubectl get pods -o wide | grep eleuther-neox | awk '{print $6 " slots=8"}' > hosts export MASTER_ID=$(kubectl get pods | grep eleuther-neox | awk '{print $1}' | head -n 1) echo $MASTER_ID kubectl cp $PWD/hosts $MASTER_ID:/app +kubectl cp $PWD/id_ed25519 $MASTER_ID:/root/.ssh + +mv id_rsa.pub authorized_keys + +for id in $(kubectl get pods | grep eleuther-neox | awk '{print $1}') +do + echo copying keys to $id + kubectl cp $PWD/authorized_keys $MASTER_ID:/root/.ssh/ +done +rm authorized_keys hosts #echo 'git remote set-url origin https://github.com/EleutherAI/gpt-neox/ && git pull' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash kubectl exec --stdin --tty $MASTER_ID -- /bin/bash From dcd11ae2a39090ddfd55820ea768f4eeb07515f4 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Sat, 23 Jan 2021 01:22:55 -0700 Subject: [PATCH 11/16] Harden security slightly --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2d55e60c1..adebafeb9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,12 @@ FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ - echo 'PermitEmptyPasswords yes' >> /etc/ssh/sshd_config && \ + echo 'PasswordAuthentication no' >> /etc/ssh/sshd_config && \ apt-get update && \ apt-get install -y git python3.8 python3.8-dev python3-pip sudo pdsh && \ update-alternatives --install /usr/bin/python3 python3 
/usr/bin/python3.8 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ - passwd -d root + update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 RUN python3 -m pip install --upgrade pip && \ pip3 install torch pipx && \ From e682f7f289b75c7087e7d3afc04999156eaf287f Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Sat, 23 Jan 2021 01:35:11 -0700 Subject: [PATCH 12/16] Update docker for custom keygen --- Dockerfile | 9 ++------- deploy_k8s.sh | 6 +++--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index adebafeb9..7fdd11e39 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,13 +11,8 @@ RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config & RUN python3 -m pip install --upgrade pip && \ pip3 install torch pipx && \ python3 -m pipx ensurepath && \ - mkdir /app - -RUN mkdir -p ~/.ssh && \ - echo 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChClBdh2UXpMhBV725nH1bMsGVaLmYAp8uMZFHtBqHN56sj/35jeenQTqr/Tov3u6xwqJK+rxegjgcPDZfuOSdsNnTMCLIacA/WqBMwW1mjdMc+zFOub7vQJHj4nmeF3pd4tSjt720ZLiX1ZsF5QrTIcnURAXT0/82SKIy2nqj18v9HCcIvBplexJU3SlVg+oWk/e5CsfnXvJMQH3VqJaeyrXIlaFgVOvVWSBY66Kc2H+g1RxLwe+BONyNanxSXxKHQwUXawBvXIyjekapB3HiNWLZZfZjNmqe2Ci+Y9PKn0CrkgXopIP7tVKn+UQ2fD3nSjaZyfRrmcFgXdotFKJh root@sshd-sid-79b6f9d7c6-mmw8x' > ~/.ssh/authorized_keys && \ - echo 'Host *' > ~/.ssh/config && \ - echo ' StrictHostKeyChecking no' >> ~/.ssh/config && \ - chmod 600 ~/.ssh/config + mkdir /app && \ + mkdir ~/.ssh WORKDIR /app diff --git a/deploy_k8s.sh b/deploy_k8s.sh index 1bcef4601..00d4140da 100755 --- a/deploy_k8s.sh +++ b/deploy_k8s.sh @@ -1,15 +1,14 @@ kubectl delete deploy/eleuther-neox kubectl apply -f kubernetes/deploy_k8s.yml -echo Waiting for deploy to complete... -rm id_rsa* ssh-keygen -t rsa -f id_rsa -N "" +echo Waiting for deploy to complete... 
kubectl wait --for=condition=available --timeout=600s deployment/eleuther-neox || exit kubectl get pods -o wide | grep eleuther-neox | awk '{print $6 " slots=8"}' > hosts export MASTER_ID=$(kubectl get pods | grep eleuther-neox | awk '{print $1}' | head -n 1) echo $MASTER_ID kubectl cp $PWD/hosts $MASTER_ID:/app -kubectl cp $PWD/id_ed25519 $MASTER_ID:/root/.ssh +kubectl cp $PWD/id_rsa $MASTER_ID:/root/.ssh mv id_rsa.pub authorized_keys @@ -19,5 +18,6 @@ do kubectl cp $PWD/authorized_keys $MASTER_ID:/root/.ssh/ done rm authorized_keys hosts +rm id_rsa* #echo 'git remote set-url origin https://github.com/EleutherAI/gpt-neox/ && git pull' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash kubectl exec --stdin --tty $MASTER_ID -- /bin/bash From 8a26b7e8897c8aeba4a29a210818d0c49e074084 Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Sat, 23 Jan 2021 11:44:58 -0700 Subject: [PATCH 13/16] Fix ssh --- Dockerfile | 19 +++++++++---------- deploy_k8s.sh | 3 ++- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7fdd11e39..ec324e18f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,18 @@ FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn -RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ - echo 'PasswordAuthentication no' >> /etc/ssh/sshd_config && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y git python3.8 python3.8-dev python3-pip sudo pdsh && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 - -RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install --upgrade pip && \ pip3 install torch pipx && \ - python3 -m pipx ensurepath && \ - mkdir /app && \ - mkdir ~/.ssh + python3 -m pipx ensurepath +RUN mkdir -p ~/.ssh /app && \ + echo 'Host *' > ~/.ssh/config && \ + echo ' 
StrictHostKeyChecking no' >> ~/.ssh/config && \ + echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ + echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config WORKDIR /app @@ -23,4 +22,4 @@ RUN sh ./install_deepspeed.sh COPY requirements.txt /app RUN pip install -r requirements.txt -COPY . /app +COPY . /app \ No newline at end of file diff --git a/deploy_k8s.sh b/deploy_k8s.sh index 00d4140da..fb5eb8def 100755 --- a/deploy_k8s.sh +++ b/deploy_k8s.sh @@ -19,5 +19,6 @@ do done rm authorized_keys hosts rm id_rsa* -#echo 'git remote set-url origin https://github.com/EleutherAI/gpt-neox/ && git pull' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash +echo 'chmod 600 ~/.ssh/authorized_keys && chmod 700 ~/.ssh && chown -R root /root/.ssh' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash + kubectl exec --stdin --tty $MASTER_ID -- /bin/bash From 0e36734c7c10185bc061e63adf6c941c048e627a Mon Sep 17 00:00:00 2001 From: Leo Gao Date: Sat, 23 Jan 2021 11:50:20 -0700 Subject: [PATCH 14/16] Fix deploy script to use right id --- deploy_k8s.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy_k8s.sh b/deploy_k8s.sh index fb5eb8def..ddc62383f 100755 --- a/deploy_k8s.sh +++ b/deploy_k8s.sh @@ -15,10 +15,10 @@ mv id_rsa.pub authorized_keys for id in $(kubectl get pods | grep eleuther-neox | awk '{print $1}') do echo copying keys to $id - kubectl cp $PWD/authorized_keys $MASTER_ID:/root/.ssh/ + kubectl cp $PWD/authorized_keys $id:/root/.ssh/ + echo 'chmod 600 ~/.ssh/authorized_keys && chmod 700 ~/.ssh && chown -R root /root/.ssh' | kubectl exec --stdin $id -- /bin/bash done rm authorized_keys hosts rm id_rsa* -echo 'chmod 600 ~/.ssh/authorized_keys && chmod 700 ~/.ssh && chown -R root /root/.ssh' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash kubectl exec --stdin --tty $MASTER_ID -- /bin/bash From 480dc36d56434c8ab82f55dfba313f72f9ef0c5e Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Sat, 23 Jan 2021 14:30:37 -0500 
Subject: [PATCH 15/16] Added logging config --- configs/deepspeed_zero2.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/configs/deepspeed_zero2.json b/configs/deepspeed_zero2.json index ddcf80500..df0dcf383 100644 --- a/configs/deepspeed_zero2.json +++ b/configs/deepspeed_zero2.json @@ -1,5 +1,5 @@ { - "train_batch_size": 256, + "train_batch_size": 1028, "gradient_accumulation_steps": 1, "gradient_clipping": 1.0, "tensorboard": { @@ -31,6 +31,10 @@ "contiguous_gradients" : false, "cpu_offload": false }, + "logging": { + "steps_per_print": 100, + "wall_clock_breakdown": true + }, "activation_checkpointing": { "comment": "to turn on activation checkpointing, set this to a positive integer. Do not touch other params.", "partition_activations": false, From 221d73a3833054c42fcb4e8f99c63adb79832f01 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Sat, 23 Jan 2021 15:58:39 -0500 Subject: [PATCH 16/16] Update README.md --- README.md | 55 ++++++++++++++++++++++--------------------------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index ca98d058e..0dffe8220 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # GPT-NeoX -An implementation of model parallel GPT-3-like models on GPUs, based on the DeepSpeed library. Designed to be able to train models in the hundreds of billions of parameters or larger. +An implementation of model parallel GPT-3-like models on GPUs, based on the DeepSpeed library. Designed to be able to train models in the hundreds of billions of parameters or larger. This repository is under development and may change rapidly without warning. 
## Requirements @@ -7,51 +7,40 @@ An implementation of model parallel GPT-3-like models on GPUs, based on the Deep $ pip install -r requirements.txt ``` -Test deepspeed locally +## Running the code +The anatomy of a call to the DeepSpeed engine is the following ```bash -$ deepspeed train_enwik8.py \ +$ deepspeed --hostfile=host_path train_script.py \ --deepspeed \ --deepspeed_config ./configs/base_deepspeed.json ``` -## Sparse Attention +### Running the code locally -To use sparse attention in your GPTNeoX model, you first need to make sure Deepspeed is installed with sparse attention enabled. You can use the following script to install all the dependencies as well as reinstall Deepspeed. +### Running the code on a server -```bash -$ ./install_deepspeed.sh -``` +This code is set up to run automatically on as many GPUs as are available. To run across multiple machines, you need to make use of a hostfile which lists the IP address of each machine you wish to run the code on followed by the number of GPUs to use. For example, `123.45.67.890 slots=8` instructs the code to run on all eight GPUs of the machine at `123.45.67.890`. Each machine should be listed on a separate line with no end-of-line punctuation. It is officially recommended that you set up passwordless ssh, but we have had success entering the password at run-time. To have your hostfile used by GPT-NeoX automatically, store it at `~/jobs/hostfile`. Otherwise, you can provide it as an argument as shown above. 
-Then - -```python -model = GPTNeoX( - num_tokens = 20000, - dim = 512, - seq_len = SEQ_LEN, - depth = 12, - heads = 8, - sparse_attn = True, -) -``` +**EleutherAI members:** -Or if you want it for specific layers - -```python -model = GPTNeoX( - num_tokens = 20000, - dim = 512, - seq_len = SEQ_LEN, - depth = 12, - heads = 8, - sparse_attn = (True, False) * 6, # interleaved -) -``` +### ~/scripts/ + +The directory `~/scripts/` stores various scripts for automatically starting runs with particular settings and configs that we have found useful. They can be run using `sh scripts/script_name.sh` but should not be relied upon. We do not guarantee forward compatibility of any scripts. + +## Datasets + +### Tokenizers + +### Using our data + +### Using your data + +## Advanced Options ## Contribute If you want to get involved, check out our repo projects. Anything that is listed as "todo" or has not been assigned to anyone is fair game, but please leave a comment so that we know you're working on it! ## Resources -If you have trouble getting the model to run, consider consulting [this guide](https://gist.github.com/kevinwatkins/232b88bfecbeca8d48d612a3e9cf65e4) to installing in a GCE virtual machine. +If you have trouble getting the model to run, consider consulting [this guide](https://gist.github.com/kevinwatkins/232b88bfecbeca8d48d612a3e9cf65e4) to installing in a GCE virtual machine. You may also find the (very sparse) [DeepSpeed docs](https://www.deepspeed.ai) helpful.