EleutherAI · StellaAthena · Jan 23, 2021 · Jan 23, 2021 · Jan 23, 2021 · Jan 23, 2021
@@ -0,0 +1,30 @@
+# Base image; judging by its name and tag it provides CUDA 10.2 + cuDNN with an
+# SSH server (NOTE(review): confirm against the image's own documentation).
+FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn
+
+# System setup, done in a single layer:
+#   1. append two options to sshd_config: read keys from ~/.ssh/authorized_keys
+#      and allow logins with an empty password;
+#   2. install git, Python 3.8 (with headers), pip, sudo and pdsh;
+#   3. register Python 3.8 as the target of both `python3` and `python`;
+#   4. delete root's password.
+#
+# printf is used instead of `echo -e`: RUN executes under /bin/sh by default, and
+# a POSIX echo (e.g. dash) prints the "-e" literally, which would corrupt
+# sshd_config.
+#
+# DEBIAN_FRONTEND=noninteractive is set inline (not via ENV) so package
+# configuration prompts cannot stall the build and the variable does not leak
+# into the runtime environment.
+#
+# SECURITY NOTE(review): PermitEmptyPasswords combined with a password-less root
+# account lets anyone who can reach sshd log in as root. Only acceptable on an
+# isolated cluster network.
+RUN printf '%s\n' 'AuthorizedKeysFile     .ssh/authorized_keys' 'PermitEmptyPasswords yes' >> /etc/ssh/sshd_config && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y git python3.8 python3.8-dev python3-pip sudo pdsh && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
+    passwd -d root
+
+# Python tooling: upgrade pip, install torch and pipx, let pipx add its binary
+# directory to PATH, and create the application directory.
+# `python3 -m pip` is used for every install so the packages always land in the
+# interpreter that `python3` resolves to; --no-cache-dir keeps pip's download
+# cache out of the image layer. `mkdir -p` tolerates /app already existing.
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir torch pipx && \
+    python3 -m pipx ensurepath && \
+    mkdir -p /app
+
+# SSH configuration for the build user's home directory:
+#   - authorize one fixed public key for incoming logins;
+#   - disable host-key checking for all outgoing connections.
+# printf replaces `echo -e` because the default RUN shell (/bin/sh) may print
+# the "-e" literally, producing an invalid first line in ~/.ssh/config.
+# Permissions are set explicitly (700 on the directory, 600 on the files)
+# rather than relying on the build-time umask.
+# NOTE(review): the authorized key is baked into the image; anyone holding the
+# matching private key can log in to every container built from it.
+RUN mkdir -p ~/.ssh && \
+    chmod 700 ~/.ssh && \
+    echo 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChClBdh2UXpMhBV725nH1bMsGVaLmYAp8uMZFHtBqHN56sj/35jeenQTqr/Tov3u6xwqJK+rxegjgcPDZfuOSdsNnTMCLIacA/WqBMwW1mjdMc+zFOub7vQJHj4nmeF3pd4tSjt720ZLiX1ZsF5QrTIcnURAXT0/82SKIy2nqj18v9HCcIvBplexJU3SlVg+oWk/e5CsfnXvJMQH3VqJaeyrXIlaFgVOvVWSBY66Kc2H+g1RxLwe+BONyNanxSXxKHQwUXawBvXIyjekapB3HiNWLZZfZjNmqe2Ci+Y9PKn0CrkgXopIP7tVKn+UQ2fD3nSjaZyfRrmcFgXdotFKJh root@sshd-sid-79b6f9d7c6-mmw8x' > ~/.ssh/authorized_keys && \
+    printf '%s\n' 'Host *' '    StrictHostKeyChecking no' > ~/.ssh/config && \
+    chmod 600 ~/.ssh/authorized_keys ~/.ssh/config
+
+
+# Application install. Files are copied in order of how rarely they change so
+# the expensive install layers stay cached when only source code is edited:
+# DeepSpeed installer first, then the Python requirements, then the full tree.
+WORKDIR /app
+
+# Trailing slash makes it explicit that /app is a directory destination.
+COPY install_deepspeed.sh /app/
+RUN sh ./install_deepspeed.sh
+
+COPY requirements.txt /app/
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . /app
@@ -1,4 +1,5 @@
 import torch
+import torch.utils.checkpoint
 import torch.nn.functional as F
 from torch import nn, einsum
 from functools import partial

diff --git a/kubernetes/deploy_k8s.sh b/kubernetes/deploy_k8s.sh
@@ -0,0 +1,11 @@
+kubectl delete deploy/eleuther-neox
+kubectl apply -f deploy_k8s.yml
+echo Waiting for deploy to complete...
+kubectl wait --for=condition=available --timeout=600s deployment/eleuther-neox || exit
+
+kubectl get pods -o wide | grep eleuther-neox | awk '{print $6 " slots=8"}' > hosts
+export MASTER_ID=$(kubectl get pods | grep eleuther-neox | awk '{print $1}' | head -n 1)
+echo $MASTER_ID
+kubectl cp $PWD/hosts $MASTER_ID:/app
+#echo 'git remote set-url origin https://github.com/EleutherAI/gpt-neox/ && git pull' | kubectl exec --stdin --tty $MASTER_ID -- /bin/bash
+kubectl exec --stdin --tty $MASTER_ID -- /bin/bash
@@ -0,0 +1,57 @@
+# Deployment of two identical pods, each running sshd in the foreground and
+# limited to 8 NVIDIA GPUs.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: eleuther-neox
+spec:
+  replicas: 2
+  # Recreate: existing pods are terminated before replacements are started.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: eleuther-neox
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: eleuther-neox
+    spec:
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 10
+      containers:
+        - name: neox
+          image: leogao2/deepspeed_eleuther
+          # sshd is the container's main process; -D keeps it from daemonizing.
+          command:
+            - /usr/sbin/sshd
+          args:
+            - "-D"
+          tty: true
+          ports:
+            # NOTE(review): nothing in this manifest sets sshd's listening port;
+            # confirm the image's sshd_config actually uses 2222.
+            - name: sshd
+              containerPort: 2222
+              protocol: TCP
+          resources:
+            requests:
+              cpu: 30
+              memory: 40Gi
+            limits:
+              nvidia.com/gpu: 8
+          volumeMounts:
+            # Memory-backed volume mounted over /dev/shm (see `volumes` below).
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      # Schedule only onto nodes matching both the GPU model and the region.
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  # Edit for different GPU
+                  - key: gpu.nvidia.com/model
+                    operator: In
+                    values:
+                      - GeForce_RTX_2080_Ti
+                  - key: failure-domain.beta.kubernetes.io/region
+                    operator: In
+                    values:
+                      - ORD1