Merge pull request #1 from EleutherAI/sid_dev
Simplify, add ssh kube scripts, remove BERT stuff
StellaAthena committed Feb 8, 2021
2 parents d77fc3f + 72e1a54 commit 624df98
Showing 78 changed files with 303 additions and 9,384 deletions.
12 changes: 0 additions & 12 deletions images/Makefile

This file was deleted.

Binary file removed images/cases.png
Binary file removed images/scaling-dp.png
Binary file removed images/scaling-mp.png
40 changes: 0 additions & 40 deletions images/tables.tex

This file was deleted.

13 changes: 13 additions & 0 deletions kubernetes/deploy_sshd_service.sh
@@ -0,0 +1,13 @@
# replace 'sid' in .yaml files with some unique name

kubectl apply -f sshd-root-pvc.yaml
kubectl apply -f sshd-data-pvc.yaml
kubectl apply -f sshd-service.yaml
kubectl apply -f sshd-deployment.yaml

echo "getting pod id"

kubectl get pods

echo "run 'kubectl logs -f <pod ID> init' to get root pw"
echo "then 'kubectl get service' to get the SSH service external IP"
14 changes: 14 additions & 0 deletions kubernetes/sshd-data-pvc.yaml
@@ -0,0 +1,14 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: sshd-data-pv-claim-sid
spec:
  # Available storage classes at time of writing are
  #   ceph-ssd-2-replica - SSD Backed Storage with 2 Replicas
  #   ceph-hdd-2-replica - HDD Backed Storage with 2 Replicas
  storageClassName: ceph-hdd-2-replica
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 500Gi
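After applying this claim it is worth confirming it actually binds before deploying anything that mounts it. This is plain kubectl, nothing repo-specific; the claim name assumes the default 'sid' suffix is still in place.

kubectl get pvc sshd-data-pv-claim-sid
# STATUS should read 'Bound' once the Ceph volume is provisioned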
136 changes: 136 additions & 0 deletions kubernetes/sshd-deployment.yaml
@@ -0,0 +1,136 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sshd-sid
spec:
  strategy:
    type: Recreate
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: sshd
  template:
    metadata:
      labels:
        app.kubernetes.io/name: sshd
    spec:
      terminationGracePeriodSeconds: 10
      initContainers:
      - name: init
        image: atlanticcrypto/cuda-ssh-server:10.2-cudnn
        command: ["/bin/bash"]
        args: ["-c", "if [ ! -f /target/initialized ]; then
          password=$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 13 ; echo '');
          echo \"root:$password\" | chpasswd;
          echo \"Root password is: $password\";
          sed -i -e 's/archive.ubuntu.com/mirror.deace.id/' /etc/apt/sources.list;
          echo 'Acquire::http::Proxy \"http://10.134.98.2:3128\";' > /etc/apt/apt.conf.d/proxy.conf;
          cp -ax / /target;
          echo 'Initialization complete';
          touch /target/initialized;
          fi"]
        resources:
          requests:
            cpu: 1
            memory: 1Gi
        volumeMounts:
        - name: root-storage
          mountPath: /target

      containers:
      - name: sshd-sid
        command: ["/usr/sbin/sshd"]
        args: ["-D"]
        tty: true
        image: atlanticcrypto/cuda-ssh-server:10.2-cudnn
        ports:
        - name: ssh-sid
          containerPort: 22
          protocol: TCP
        volumeMounts:
        - name: data-storage
          mountPath: /mnt/data
        - name: root-storage
          mountPath: /bin
          subPath: bin
        - name: root-storage
          mountPath: /boot
          subPath: boot
        - name: root-storage
          mountPath: /etc
          subPath: etc
        - name: root-storage
          mountPath: /home
          subPath: home
        - name: root-storage
          mountPath: /lib
          subPath: lib
        - name: root-storage
          mountPath: /lib64
          subPath: lib64
        - name: root-storage
          mountPath: /opt
          subPath: opt
        - name: root-storage
          mountPath: /root
          subPath: root
        - name: root-storage
          mountPath: /sbin
          subPath: sbin
        - name: root-storage
          mountPath: /srv
          subPath: srv
        - name: root-storage
          mountPath: /usr
          subPath: usr
        - name: root-storage
          mountPath: /var
          subPath: var
        - name: run-lock
          mountPath: /run/lock
        - name: dshm
          mountPath: /dev/shm

        resources:
          requests:
            cpu: 30  # CPU can be given in cores or millicores: 500m is 0.5 cores
            memory: 40Gi
          limits:
            nvidia.com/gpu: 8
            # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to.
            # Making individual Pods resource-light is advantageous for bin-packing. Since this Pod is for general-purpose
            # interactive testing, we allocate all 8 GPUs to it.

      # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type.
      # No affinity allows scheduling on all hardware types that can fulfill the resource request.
      # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod.
      # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
      affinity:
        nodeAffinity:
          # This will REQUIRE the Pod to be scheduled on a node with a GeForce RTX 2080 Ti GPU in the ORD1 region
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: gpu.nvidia.com/model
                operator: In
                values:
                - GeForce_RTX_2080_Ti
              - key: failure-domain.beta.kubernetes.io/region
                operator: In
                values:
                - ORD1

      volumes:
      - name: root-storage
        persistentVolumeClaim:
          claimName: sshd-root-pv-claim-sid
      - name: data-storage
        persistentVolumeClaim:
          claimName: sshd-data-pv-claim-sid
      - name: run-lock
        emptyDir:
          medium: Memory
      - name: dshm
        emptyDir:
          medium: Memory
      restartPolicy: Always
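Once the deployment schedules, a quick sanity check is to confirm the container actually sees the GPUs reserved by the limit above. This is a standard kubectl invocation; the pod ID is a placeholder.

kubectl exec -it <pod-id> -- nvidia-smi
# should list 8 GPUs, matching the nvidia.com/gpu limit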
16 changes: 16 additions & 0 deletions kubernetes/sshd-root-pvc.yaml
@@ -0,0 +1,16 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: sshd-root-pv-claim-sid
spec:
  # Available storage classes at time of writing are
  #   ceph-ssd-2-replica  - SSD Backed Storage with 2 Replicas
  #   ceph-ssd-no-replica - SSD Backed Storage with 1 Replica (unsafe)
  #   ceph-hdd-2-replica  - HDD Backed Storage with 2 Replicas
  #   ceph-hdd-no-replica - HDD Backed Storage with 1 Replica (unsafe)
  storageClassName: ceph-ssd-2-replica
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 200Gi
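The class list in the comment reflects what was available when this was written; on your own cluster it is safer to check what actually exists than to trust the comment:

kubectl get storageclass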
19 changes: 19 additions & 0 deletions kubernetes/sshd-service.yaml
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  annotations:
    metallb.universe.tf/address-pool: public-ord1
    # Setting a sharing key might save public IP addresses
    # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail
    metallb.universe.tf/allow-shared-ip: example-1
  name: sshd-sid
spec:
  type: LoadBalancer
  externalTrafficPolicy: Local
  ports:
  - name: ssh-sid
    port: 22
    protocol: TCP
    targetPort: ssh-sid
  selector:
    app.kubernetes.io/name: sshd
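Once MetalLB assigns an address, connecting is the usual two steps. The IP is whatever EXTERNAL-IP kubectl reports (shown here as a placeholder), and the password is the one the init container printed.

kubectl get service sshd-sid
ssh root@<EXTERNAL-IP>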
14 changes: 2 additions & 12 deletions megatron/__init__.py
@@ -14,23 +14,13 @@
# limitations under the License.
import torch

from .package_info import (
    __description__,
    __contact_names__,
    __url__,
    __download_url__,
    __keywords__,
    __license__,
    __package_name__,
    __version__,
)

from .global_vars import get_args
from .global_vars import get_tokenizer
from .global_vars import get_tensorboard_writer
from .global_vars import get_adlr_autoresume
from .global_vars import get_timers
from .initialize import initialize_megatron


def print_rank_0(message):
    """If distributed is initialized print only on rank 0."""
18 changes: 10 additions & 8 deletions megatron/checkpointing.py
@@ -29,16 +29,19 @@

_CHECKPOINT_VERSION = None


def set_checkpoint_version(value):
    global _CHECKPOINT_VERSION
    assert _CHECKPOINT_VERSION is None, \
        "checkpoint version already set"
    _CHECKPOINT_VERSION = value


def get_checkpoint_version():
    global _CHECKPOINT_VERSION
    return _CHECKPOINT_VERSION


def check_checkpoint_args(checkpoint_args):
    """Ensure fixed arguments for a model are the same for the input
    arguments and the ones retrieved from the checkpoint."""
@@ -49,7 +52,7 @@ def _compare(arg_name):
        args_value = getattr(args, arg_name)
        error_message = '{} value from checkpoint ({}) is not equal to the ' \
                        'input argument value ({}).'.format(
                            arg_name, checkpoint_value, args_value)
        assert checkpoint_value == args_value, error_message

    _compare('num_layers')
@@ -103,8 +106,8 @@ def save_ds_checkpoint(iteration, model, args):
    sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()

    if args.pipe_parallel_size == 0:
        # megatron model uses state_dict_for_save_checkpointing instead of the standard state_dict
        # state_dict is used by deepspeed for module saving so it needs to point to the right function
        model.module.state_dict = model.module.state_dict_for_save_checkpoint
    else:
        # Pipeline parallelism manages its own state_dict.
@@ -116,7 +119,7 @@ def save_ds_checkpoint(iteration, model, args):
def save_checkpoint(iteration, model, optimizer, lr_scheduler):
    """Save a model checkpoint."""
    args = get_args()

    if args.deepspeed:
        save_ds_checkpoint(iteration, model, args)
    else:
@@ -151,8 +154,8 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
    # Save.
    checkpoint_name = get_checkpoint_name(args.save, iteration)
    print('global rank {} is saving checkpoint at iteration {:7d} to {}'.
          format(torch.distributed.get_rank(), iteration,
                 checkpoint_name))
    ensure_directory_exists(checkpoint_name)
    torch.save(state_dict, checkpoint_name)
    print(' successfully saved {}'.format(checkpoint_name))
@@ -266,9 +269,8 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
        except KeyError:
            print_rank_0('A metadata file exists but unable to load '
                         'iteration from checkpoint {}, exiting'.format(
                             checkpoint_name))
            sys.exit()

    # Check arguments.
    if 'args' in state_dict: