Add initial Dockerfile and k8s deployment

EleutherAI · StellaAthena · Jan 23, 2021
commit 1bdb8141517ebd6f207588c266326200ce2c7455
diff --git a/kubernetes/Dockerfile b/kubernetes/Dockerfile
@@ -0,0 +1,15 @@
+# Base image. Judging by its name and tag it bundles CUDA 10.2, cuDNN and an
+# SSH server; the later steps rely on /etc/ssh/sshd_config already existing.
+# NOTE(review): the tag is mutable and not pinned by digest, so rebuilds may
+# pick up a different base - confirm whether that is acceptable.
+FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn
+
+# Configure sshd and install the OS-level toolchain in a single layer:
+#  - append an AuthorizedKeysFile directive so sshd looks up keys in
+#    .ssh/authorized_keys relative to the user's home directory;
+#  - install git, Python 3.8 (with headers), pip and sudo;
+#  - register python3.8 as the alternative behind both `python3` and `python`;
+#  - drop the apt package lists in the same layer so they do not stay in the image.
+# DEBIAN_FRONTEND is set inline (not via ENV) so the install cannot block on an
+# interactive prompt during the build, without leaking into the runtime environment.
+RUN echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y git python3.8 python3.8-dev python3-pip sudo && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Public key that is allowed to log in as the build user (root unless a USER
+# instruction is added). The default is the key this image has always shipped;
+# override it at build time with:
+#   docker build --build-arg SSH_AUTHORIZED_KEY="$(cat ~/.ssh/id_rsa.pub)" .
+# Only ever pass a *public* key here: build args are visible in `docker history`.
+ARG SSH_AUTHORIZED_KEY="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQChClBdh2UXpMhBV725nH1bMsGVaLmYAp8uMZFHtBqHN56sj/35jeenQTqr/Tov3u6xwqJK+rxegjgcPDZfuOSdsNnTMCLIacA/WqBMwW1mjdMc+zFOub7vQJHj4nmeF3pd4tSjt720ZLiX1ZsF5QrTIcnURAXT0/82SKIy2nqj18v9HCcIvBplexJU3SlVg+oWk/e5CsfnXvJMQH3VqJaeyrXIlaFgVOvVWSBY66Kc2H+g1RxLwe+BONyNanxSXxKHQwUXawBvXIyjekapB3HiNWLZZfZjNmqe2Ci+Y9PKn0CrkgXopIP7tVKn+UQ2fD3nSjaZyfRrmcFgXdotFKJh root@sshd-sid-79b6f9d7c6-mmw8x"
+
+# Write the key to ~/.ssh/authorized_keys (overwriting any existing file) and
+# set the permissions explicitly, so sshd's ownership/mode checks pass
+# regardless of the umask in effect during the build.
+RUN mkdir -p ~/.ssh && \
+    chmod 700 ~/.ssh && \
+    printf '%s\n' "${SSH_AUTHORIZED_KEY}" > ~/.ssh/authorized_keys && \
+    chmod 600 ~/.ssh/authorized_keys
+
+# Install the Python tooling with the interpreter selected above.
+#  - Every step goes through `python3 -m pip`, so the packages are installed
+#    for the same interpreter that `python3` resolves to, rather than whatever
+#    `pip3` script happens to be first on PATH after the pip upgrade.
+#  - --no-cache-dir keeps pip's download cache out of the image layer.
+#  - `pipx ensurepath` adds pipx's bin directory to the user's shell startup
+#    files so pipx-installed tools are on PATH in login shells.
+# NOTE(review): torch and pipx are unpinned, so rebuilds may pull different
+# versions - consider pinning once the required versions are known.
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir torch pipx && \
+    python3 -m pipx ensurepath
@@ -0,0 +1,51 @@
+# Deployment that runs two GPU pods, each exposing an SSH daemon as its main
+# process. Keys and values are unchanged; only the YAML nesting has been
+# restored so the manifest parses.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: eleuther-neox
+spec:
+  # Recreate: existing pods are terminated before replacements are created.
+  strategy:
+    type: Recreate
+  replicas: 2
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: eleuther-neox
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: eleuther-neox
+    spec:
+      terminationGracePeriodSeconds: 10
+      containers:
+        - name: neox
+          # Run sshd in the foreground (-D) so it is the container's main process.
+          command: ["/usr/sbin/sshd"]
+          args: ["-D"]
+          tty: true
+          # NOTE(review): the image has no tag, so the registry's default tag is
+          # pulled - confirm whether a fixed tag or digest should be used.
+          image: leogao2/deepspeed_eleuther
+          ports:
+            # NOTE(review): sshd listens on whatever port the image's
+            # sshd_config sets - confirm that it is 2222.
+            - name: sshd
+              containerPort: 2222
+              protocol: TCP
+
+          resources:
+            requests:
+              cpu: 30
+              memory: 40Gi
+            limits:
+              nvidia.com/gpu: 8
+
+      # Only schedule onto nodes that match both expressions below.
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  # Edit for different GPU
+                  - key: gpu.nvidia.com/model
+                    operator: In
+                    values:
+                      - GeForce_RTX_2080_Ti
+                  - key: failure-domain.beta.kubernetes.io/region
+                    operator: In
+                    values:
+                      - ORD1
+
+      restartPolicy: Always