Merge pull request #1 from EleutherAI/sid_dev
Simplify, add ssh kube scripts, remove BERT stuff
StellaAthena committed Feb 8, 2021
2 parents d77fc3f + 72e1a54 commit 624df98
Showing 78 changed files with 303 additions and 9,384 deletions.
12 changes: 0 additions & 12 deletions images/Makefile

This file was deleted.

Binary file removed images/cases.png
Binary file removed images/scaling-dp.png
Binary file removed images/scaling-mp.png
40 changes: 0 additions & 40 deletions images/tables.tex

This file was deleted.

13 changes: 13 additions & 0 deletions kubernetes/deploy_sshd_service.sh
@@ -0,0 +1,13 @@
# replace 'sid' in .yaml files with some unique name

kubectl apply -f sshd-root-pvc.yaml
kubectl apply -f sshd-data-pvc.yaml
kubectl apply -f sshd-service.yaml
kubectl apply -f sshd-deployment.yaml

echo "getting pod id"

kubectl get pods

echo "run 'kubectl logs -f <pod ID> init' to get root pw"
echo "then 'kubectl get service' to get the SSH service external IP"
14 changes: 14 additions & 0 deletions kubernetes/sshd-data-pvc.yaml
@@ -0,0 +1,14 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: sshd-data-pv-claim-sid
spec:
  # Available storage classes at time of writing are
  #   ceph-ssd-2-replica - SSD Backed Storage with 2 Replicas
  #   ceph-hdd-2-replica - HDD Backed Storage with 2 Replicas
  storageClassName: ceph-hdd-2-replica
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 500Gi
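After applying this claim it is worth confirming it actually binds before deploying anything that mounts it. This is plain kubectl, nothing repo-specific; the claim name assumes the default 'sid' suffix is still in place.

kubectl get pvc sshd-data-pv-claim-sid
# STATUS should read 'Bound' once the Ceph volume is provisioned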
136 changes: 136 additions & 0 deletions kubernetes/sshd-deployment.yaml
@@ -0,0 +1,136 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sshd-sid
spec:
  strategy:
    type: Recreate
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: sshd
  template:
    metadata:
      labels:
        app.kubernetes.io/name: sshd
    spec:
      terminationGracePeriodSeconds: 10
      initContainers:
      - name: init
        image: atlanticcrypto/cuda-ssh-server:10.2-cudnn
        command: ["/bin/bash"]
        args: ["-c", "if [ ! -f /target/initialized ]; then
          password=$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 13 ; echo '');
          echo \"root:$password\" | chpasswd;
          echo \"Root password is: $password\";
          sed -i -e 's/archive.ubuntu.com/mirror.deace.id/' /etc/apt/sources.list;
          echo 'Acquire::http::Proxy \"http://10.134.98.2:3128\";' > /etc/apt/apt.conf.d/proxy.conf;
          cp -ax / /target;
          echo 'Initialization complete';
          touch /target/initialized;
          fi"]
        resources:
          requests:
            cpu: 1
            memory: 1Gi
        volumeMounts:
        - name: root-storage
          mountPath: /target

      containers:
      - name: sshd-sid
        command: ["/usr/sbin/sshd"]
        args: ["-D"]
        tty: true
        image: atlanticcrypto/cuda-ssh-server:10.2-cudnn
        ports:
        - name: ssh-sid
          containerPort: 22
          protocol: TCP
        volumeMounts:
        - name: data-storage
          mountPath: /mnt/data
        - name: root-storage
          mountPath: /bin
          subPath: bin
        - name: root-storage
          mountPath: /boot
          subPath: boot
        - name: root-storage
          mountPath: /etc
          subPath: etc
        - name: root-storage
          mountPath: /home
          subPath: home
        - name: root-storage
          mountPath: /lib
          subPath: lib
        - name: root-storage
          mountPath: /lib64
          subPath: lib64
        - name: root-storage
          mountPath: /opt
          subPath: opt
        - name: root-storage
          mountPath: /root
          subPath: root
        - name: root-storage
          mountPath: /sbin
          subPath: sbin
        - name: root-storage
          mountPath: /srv
          subPath: srv
        - name: root-storage
          mountPath: /usr
          subPath: usr
        - name: root-storage
          mountPath: /var
          subPath: var
        - name: run-lock
          mountPath: /run/lock
        - name: dshm
          mountPath: /dev/shm

        resources:
          requests:
            cpu: 30  # CPU can be given in cores or millicores: 500m is 0.5 cores
            memory: 40Gi
          limits:
            nvidia.com/gpu: 8
            # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to.
            # Making individual Pods resource-light is advantageous for bin-packing. Since this Pod is for general-purpose
            # interactive testing, we allocate all 8 GPUs to it.

      # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type.
      # No affinity allows scheduling on all hardware types that can fulfill the resource request.
      # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod.
      # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
      affinity:
        nodeAffinity:
          # This will REQUIRE the Pod to be scheduled on a node with a GeForce RTX 2080 Ti GPU in the ORD1 region
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: gpu.nvidia.com/model
                operator: In
                values:
                - GeForce_RTX_2080_Ti
              - key: failure-domain.beta.kubernetes.io/region
                operator: In
                values:
                - ORD1

      volumes:
      - name: root-storage
        persistentVolumeClaim:
          claimName: sshd-root-pv-claim-sid
      - name: data-storage
        persistentVolumeClaim:
          claimName: sshd-data-pv-claim-sid
      - name: run-lock
        emptyDir:
          medium: Memory
      - name: dshm
        emptyDir:
          medium: Memory
      restartPolicy: Always
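Once the deployment schedules, a quick sanity check is to confirm the container actually sees the GPUs reserved by the limit above. This is a standard kubectl invocation; the pod ID is a placeholder.

kubectl exec -it <pod-id> -- nvidia-smi
# should list 8 GPUs, matching the nvidia.com/gpu limit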
16 changes: 16 additions & 0 deletions kubernetes/sshd-root-pvc.yaml
@@ -0,0 +1,16 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: sshd-root-pv-claim-sid
spec:
  # Available storage classes at time of writing are
  #   ceph-ssd-2-replica  - SSD Backed Storage with 2 Replicas
  #   ceph-ssd-no-replica - SSD Backed Storage with 1 Replica (unsafe)
  #   ceph-hdd-2-replica  - HDD Backed Storage with 2 Replicas
  #   ceph-hdd-no-replica - HDD Backed Storage with 1 Replica (unsafe)
  storageClassName: ceph-ssd-2-replica
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 200Gi
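The class list in the comment reflects what was available when this was written; on your own cluster it is safer to check what actually exists than to trust the comment:

kubectl get storageclass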
19 changes: 19 additions & 0 deletions kubernetes/sshd-service.yaml
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  annotations:
    metallb.universe.tf/address-pool: public-ord1
    # Setting a sharing key might save public IP addresses
    # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail
    metallb.universe.tf/allow-shared-ip: example-1
  name: sshd-sid
spec:
  type: LoadBalancer
  externalTrafficPolicy: Local
  ports:
  - name: ssh-sid
    port: 22
    protocol: TCP
    targetPort: ssh-sid
  selector:
    app.kubernetes.io/name: sshd
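Once MetalLB assigns an address, connecting is the usual two steps. The IP is whatever EXTERNAL-IP kubectl reports (shown here as a placeholder), and the password is the one the init container printed.

kubectl get service sshd-sid
ssh root@<EXTERNAL-IP>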
14 changes: 2 additions & 12 deletions megatron/__init__.py
@@ -14,23 +14,13 @@
# limitations under the License.
import torch

from .package_info import (
    __description__,
    __contact_names__,
    __url__,
    __download_url__,
    __keywords__,
    __license__,
    __package_name__,
    __version__,
)

from .global_vars import get_args
from .global_vars import get_tokenizer
from .global_vars import get_tensorboard_writer
from .global_vars import get_adlr_autoresume
from .global_vars import get_timers
from .initialize import initialize_megatron


def print_rank_0(message):
    """If distributed is initialized print only on rank 0."""
18 changes: 10 additions & 8 deletions megatron/checkpointing.py
@@ -29,16 +29,19 @@

_CHECKPOINT_VERSION = None


def set_checkpoint_version(value):
    global _CHECKPOINT_VERSION
    assert _CHECKPOINT_VERSION is None, \
        "checkpoint version already set"
    _CHECKPOINT_VERSION = value


def get_checkpoint_version():
    global _CHECKPOINT_VERSION
    return _CHECKPOINT_VERSION


def check_checkpoint_args(checkpoint_args):
    """Ensure fixed arguments for a model are the same for the input
    arguments and the ones retrieved from the checkpoint."""
@@ -49,7 +52,7 @@ def _compare(arg_name):
        args_value = getattr(args, arg_name)
        error_message = '{} value from checkpoint ({}) is not equal to the ' \
                        'input argument value ({}).'.format(
                            arg_name, checkpoint_value, args_value)
        assert checkpoint_value == args_value, error_message

    _compare('num_layers')
@@ -103,8 +106,8 @@ def save_ds_checkpoint(iteration, model, args):
    sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()

    if args.pipe_parallel_size == 0:
        # megatron model uses state_dict_for_save_checkpointing instead of the standard state_dict
        # state_dict is used by deepspeed for module saving so it needs to point to the right function
        model.module.state_dict = model.module.state_dict_for_save_checkpoint
    else:
        # Pipeline parallelism manages its own state_dict.
@@ -116,7 +119,7 @@ def save_ds_checkpoint(iteration, model, args):
def save_checkpoint(iteration, model, optimizer, lr_scheduler):
    """Save a model checkpoint."""
    args = get_args()

    if args.deepspeed:
        save_ds_checkpoint(iteration, model, args)
    else:
@@ -151,8 +154,8 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
    # Save.
    checkpoint_name = get_checkpoint_name(args.save, iteration)
    print('global rank {} is saving checkpoint at iteration {:7d} to {}'.
          format(torch.distributed.get_rank(), iteration,
                 checkpoint_name))
    ensure_directory_exists(checkpoint_name)
    torch.save(state_dict, checkpoint_name)
    print(' successfully saved {}'.format(checkpoint_name))
@@ -266,9 +269,8 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
        except KeyError:
            print_rank_0('A metadata file exists but unable to load '
                         'iteration from checkpoint {}, exiting'.format(
                             checkpoint_name))
            sys.exit()

    # Check arguments.
    if 'args' in state_dict: