
Merge remote-tracking branch 'origin/main' into model_sampling
sdtblck committed Mar 25, 2021
2 parents d770333 + cd0f0b0 commit 78b8526
Showing 35 changed files with 821 additions and 456 deletions.
197 changes: 150 additions & 47 deletions README.md

Large diffs are not rendered by default.

470 changes: 363 additions & 107 deletions configs/README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion configs/eleutherai_cluster.yml
@@ -8,4 +8,4 @@
"tensorboard-dir": "/mnt/ssd-cluster/tensorboard",
"log-dir": "/mnt/ssd-cluster/logs",
"wandb_team": "eleutherai",
}
}
26 changes: 19 additions & 7 deletions deepy.py
@@ -1,6 +1,5 @@
#!/usr/bin/env python

# Copyright 2021 (c) Josh Levy-Kramer <[email protected]>. All rights reserved.
# Copyright (c) 2021, EleutherAI contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,11 +16,11 @@
from socket import gethostname

import shortuuid
import sys
import os
import deepspeed
from deepspeed.launcher.runner import main
import requests
import subprocess

from megatron.config_monster import ConfigMonster
import logging
@@ -30,6 +29,7 @@

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


def get_wandb_api_key():
""" Get Weights and Biases API key from ENV or .netrc file. Otherwise return None """
if 'WANDB_API_KEY' in os.environ:
@@ -40,10 +40,22 @@ def get_wandb_api_key():
if wandb_token is not None:
return wandb_token[1]


def get_git_commit_hash():
""" Gets the git commit hash of your current repo (if it exists) """
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except subprocess.CalledProcessError:
git_hash = None
return git_hash


# Generate unique run group name
wandb_group = shortuuid.uuid()
default_conf = {
'wandb_group': wandb_group
extra_conf = {
'wandb_group': wandb_group,
'git_hash': get_git_commit_hash()
}

# Extract wandb API key and inject into worker environments
@@ -52,12 +64,12 @@ def get_wandb_api_key():
deepspeed.launcher.runner.EXPORT_ENVS.append('WANDB_API_KEY')
os.environ['WANDB_API_KEY'] = wandb_token

old_style_args, conf = ConfigMonster().consume_args(default_conf=default_conf)
old_style_args, conf = ConfigMonster().consume_args(default_conf=extra_conf)

if 'log-dir' in conf:
os.makedirs(conf['log-dir'], exist_ok=True)
file_prefix = os.path.join(conf['log-dir'], '0-deepy')
Tee(file_prefix+'_stdout.txt', err=False)
Tee(file_prefix + '_stdout.txt', err=False)
Tee(file_prefix + '_stderr.txt', err=True)

if 'save' in conf:
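For readers following the deepy.py changes, here is a minimal standalone sketch of the new git-hash capture; the ConfigMonster plumbing is elided, and the usage at the bottom is illustrative:

```python
# Minimal sketch of the git-hash capture added to deepy.py above. It
# assumes `git` is on PATH; outside a repo, `git describe` exits non-zero
# and the helper returns None, matching the try/except in the diff.
import subprocess

def get_git_commit_hash():
    """Return the short hash of the current repo's HEAD, or None."""
    try:
        return subprocess.check_output(
            ["git", "describe", "--always"]
        ).strip().decode()
    except subprocess.CalledProcessError:
        return None

# The hash rides along in the run config, so each W&B run records the
# exact code version it was launched from.
extra_conf = {"git_hash": get_git_commit_hash()}
print(extra_conf)  # e.g. {'git_hash': '78b8526'}
```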
2 changes: 0 additions & 2 deletions examples/pretrain_gpt2_eleutherai_cluster.sh

This file was deleted.

2 changes: 0 additions & 2 deletions examples/pretrain_gpt2_local.sh

This file was deleted.

5 changes: 0 additions & 5 deletions examples/text_generation_gpt2.sh

This file was deleted.

69 changes: 12 additions & 57 deletions megatron/arguments.py
@@ -1,6 +1,6 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
@@ -46,7 +46,6 @@ def _get_parser(extra_args_provider=None):
parser = _add_validation_args(parser)
parser = _add_data_args(parser)
parser = _add_autoresume_args(parser)
parser = _add_realm_args(parser)
parser = _add_zero_args(parser)
parser = _add_activation_checkpoint_args(parser)
parser = _add_text_generate_args(parser)
@@ -141,9 +140,6 @@ def parse_args(extra_args_provider=None, defaults={},
assert args.num_unique_layers <= args.num_layers
assert args.num_layers % args.num_unique_layers == 0, \
'num-layers should be divisible by num-unique-layers.'
if args.num_unique_layers < args.num_layers:
assert args.DDP_impl == 'local', \
'torch-DDP does not work with parameters sharing.'
# Mixed precision checks.
if args.fp16_lm_cross_entropy:
assert args.fp16, 'lm cross entropy in fp16 only supported in fp16 mode.'
@@ -198,7 +194,7 @@ def _add_network_size_args(parser):
'layers 1 and 2: '
' grouped: [1, 2, 1, 2] and spaced: [1, 1, 2, 2].')
group.add_argument('--hidden-size', type=int, default=None,
help='Tansformer hidden size.')
help='Transformer hidden size.')
group.add_argument('--num-attention-heads', type=int, default=None,
help='Number of transformer attention heads.')
group.add_argument('--max-position-embeddings', type=int, default=None,
@@ -437,13 +433,15 @@ def _add_distributed_args(parser):
help='Size of the model parallel.')
group.add_argument('--pipe-parallel-size', type=int, default=0,
help='Size of the pipeline parallel. Disable with 0.')
group.add_argument('--pipe-partition-method', type=str, default='type:transformer',
help='method used to distribute model layers across pipeline stages. Choose from "parameters", '
'which balances the number of parameters on each pipeline stage, "uniform", which naively '
'balances the number of layers per stage, or "type:[regex]" (in our instance this will '
'basically only be type:transformer), which balances layers whose class names match [regex]'
)
group.add_argument('--distributed-backend', default='nccl',
choices=['nccl', 'gloo', 'mpi'],
help='Which backend to use for distributed training.')
group.add_argument('--DDP-impl', default='local',
choices=['local', 'torch'],
help='which DistributedDataParallel implementation '
'to use.')
group.add_argument('--local_rank', type=int, default=None,
help='local rank passed from distributed launcher.')
group.add_argument('--lazy-mpu-init', type=bool, required=False,
@@ -458,14 +456,12 @@ def _add_distributed_args(parser):

def _add_validation_args(parser):
group = parser.add_argument_group(title='validation')

group.add_argument('--eval-iters', type=int, default=100,
help='Number of iterations to run for evaluation '
'validation/test for.')
group.add_argument('--eval-interval', type=int, default=1000,
help='Interval between running evaluation on '
'validation set.')

return parser


@@ -485,8 +481,6 @@ def _add_data_args(parser):
help='Path to the BPE merge file.')
group.add_argument('--seq-length', type=int, default=None,
help="Maximum sequence length to process.")
group.add_argument('--mask-prob', type=float, default=0.15,
help='Probability of replacing a token with mask.')
group.add_argument('--short-seq-prob', type=float, default=0.1,
help='Probability of producing a short sequence.')
group.add_argument('--mmap-warmup', action='store_true',
@@ -501,13 +495,13 @@ def _add_data_args(parser):
choices=['lazy', 'cached', 'mmap', 'infer'],
help='Implementation of indexed datasets.')
group.add_argument('--reset-position-ids', action='store_true',
help='Reset posistion ids after end-of-document token.')
help='Reset position ids after end-of-document token.')
group.add_argument('--reset-attention-mask', action='store_true',
help='Reset self attention maske after '
help='Reset self attention mask after '
'end-of-document token.')
group.add_argument('--eod-mask-loss', action='store_true',
help='Mask loss for the end of document tokens.')
group.add_argument('--log-dir', type=str, help='Directory to store logs.')
group.add_argument('--log-dir', type=str, help='Directory to store logs.', default='./logs')

return parser

@@ -524,45 +518,6 @@ def _add_autoresume_args(parser):
return parser


def _add_realm_args(parser):
group = parser.add_argument_group(title='realm')

# network size
group.add_argument('--ict-head-size', type=int, default=None,
help='Size of block embeddings to be used in ICT and REALM (paper default: 128)')

# checkpointing
group.add_argument('--ict-load', type=str, default=None,
help='Directory containing an ICTBertModel checkpoint')
group.add_argument('--bert-load', type=str, default=None,
help='Directory containing an BertModel checkpoint (needed to start ICT and REALM)')

# data
group.add_argument('--titles-data-path', type=str, default=None,
help='Path to titles dataset used for ICT')
group.add_argument('--query-in-block-prob', type=float, default=0.1,
help='Probability of keeping query in block for ICT dataset')
group.add_argument('--use-one-sent-docs', action='store_true',
help='Whether to use one sentence documents in ICT')

# training
group.add_argument('--report-topk-accuracies', nargs='+', default=[],
help="Which top-k accuracies to report (e.g. '1 5 20')")

# faiss index
group.add_argument('--faiss-use-gpu', action='store_true',
help='Whether create the FaissMIPSIndex on GPU')
group.add_argument('--block-data-path', type=str, default=None,
help='Where to save/load BlockData to/from')

# indexer
group.add_argument('--indexer-batch-size', type=int, default=128,
help='How large of batches to use when doing indexing jobs')
group.add_argument('--indexer-log-interval', type=int, default=1000,
help='After how many batches should the indexer report progress')
return parser


def _add_zero_args(parser):
"""Text generate arguments."""

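The `--pipe-partition-method` help text added above is easiest to see in action. A toy sketch of how a `type:[regex]` method classifies layers by class name follows; the layer classes are illustrative stand-ins, and case-insensitive matching is an assumption rather than something this diff specifies:

```python
# Illustrative sketch of "type:[regex]" partitioning: only layers whose
# class name matches the regex count toward balancing pipeline stages.
# Layer classes are stand-ins; case-insensitive matching is an assumption.
import re

class EmbeddingPipe: ...
class ParallelTransformerLayerPipe: ...

layers = [EmbeddingPipe()] + [ParallelTransformerLayerPipe() for _ in range(4)]
method = "type:transformer"
pattern = re.compile(method.split(":", 1)[1], re.IGNORECASE)
counted = [1 if pattern.search(type(layer).__name__) else 0 for layer in layers]
print(counted)  # [0, 1, 1, 1, 1] -> stages balance over the 4 matching layers
```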
44 changes: 2 additions & 42 deletions megatron/checkpointing.py
@@ -1,6 +1,6 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
@@ -332,43 +332,3 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
print(' successfully loaded {}'.format(checkpoint_name))

return iteration


def load_ict_checkpoint(model, only_query_model=False, only_block_model=False, from_realm_chkpt=False):
"""selectively load ICT models for indexing/retrieving from ICT or REALM checkpoints"""

args = get_args()

if isinstance(model, torchDDP):
model = model.module

load_path = args.load if from_realm_chkpt else args.ict_load

tracker_filename = get_checkpoint_tracker_filename(load_path)
with open(tracker_filename, 'r') as f:
iteration = int(f.read().strip())

# assert iteration > 0
checkpoint_name = get_checkpoint_name(load_path, iteration, False)
if mpu.get_data_parallel_rank() == 0:
print('global rank {} is loading checkpoint {}'.format(
torch.distributed.get_rank(), checkpoint_name))

state_dict = torch.load(checkpoint_name, map_location='cpu')
ict_state_dict = state_dict['model']
if from_realm_chkpt and mpu.get_data_parallel_rank() == 0:
print(" loading ICT state dict from REALM", flush=True)
ict_state_dict = ict_state_dict['retriever']['ict_model']

if only_query_model:
ict_state_dict.pop('context_model')
if only_block_model:
ict_state_dict.pop('question_model')

model.load_state_dict(ict_state_dict)
torch.distributed.barrier()

if mpu.get_data_parallel_rank() == 0:
print(' successfully loaded {}'.format(checkpoint_name))

return model
30 changes: 26 additions & 4 deletions megatron/config_monster.py
@@ -1,4 +1,16 @@
# Copyright 2021 (c) Josh Levy-Kramer <[email protected]>. All rights reserved.
# Copyright (c) 2021, EleutherAI contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
@@ -9,6 +21,7 @@

from megatron.utils import obtain_resource_pool
from megatron.arguments import _get_parser
import torch

log = logging.getLogger('ConfigMonster')

@@ -25,7 +38,7 @@ def _get_megatron_keys(_megatron_keys_exclude):


ds_runner_keys = ['hostfile', 'include', 'exclude', 'num_nodes', 'num_gpus', 'master_port', 'master_addr', 'launcher',
'launcher_args'] # handle separately: 'user_script', 'user_args'
'launcher_args', 'detect_nvlink_pairs'] # handle separately: 'user_script', 'user_args'

megatron_keys_exclude = [
'fp16', # Duplicated in ds_config
@@ -46,7 +59,7 @@ def _get_megatron_keys(_megatron_keys_exclude):
'dump_state', 'flops_profiler', 'activation_checkpointing', 'sparse_attention',
'zero_allow_untested_optimizer', ]

neox_config_keys = ['wandb_group', 'wandb_team']
neox_config_keys = ['wandb_group', 'wandb_team', 'git_hash']

ds_runner_keys_exclude = []

@@ -267,6 +280,11 @@ def parse_args(parser: argparse.ArgumentParser, args=None, extra_conf=None, defa
if args.conf_dir:
conf_files = [os.path.join(args.conf_dir, f) for f in conf_files]

# enables us to pass in `small` instead of `small.yml`
conf_files = [cf if cf.endswith('.yml') else cf + '.yml' for cf in conf_files]

# Load and merge all configuration
conf = {} if extra_conf is None else extra_conf
for path in conf_files:
@@ -313,7 +331,11 @@ def derive_params_and_split(conf):
hostfile_path = conf.get('hostfile', DLTS_HOSTFILE)
resources = obtain_resource_pool(hostfile_path, conf.get('include', ''), conf.get('exclude', ''))
num_gpus = sum(map(len, resources.values()))
log.info(f"Total number of GPUs determined to be: {num_gpus}")
else:
num_gpus = torch.cuda.device_count()
conf["num_gpus"] = num_gpus

log.info(f"Total number of GPUs determined to be: {num_gpus}")

# get world size in the model/pipe parallel case, the actual `world size` deepspeed uses is the size of the
# data-parallel group, or (num_gpus / mp_size) / pp_size
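A quick worked instance of the world-size arithmetic described in that comment (the numbers are arbitrary):

```python
# Worked example of the derivation above: the world size DeepSpeed
# actually uses is the data-parallel group size,
# (num_gpus / mp_size) / pp_size.
num_gpus = 16  # total GPUs across all nodes
mp_size = 2    # model-parallel degree
pp_size = 4    # pipeline-parallel degree
dp_world_size = (num_gpus // mp_size) // pp_size
print(dp_world_size)  # 2 data-parallel replicas
```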
5 changes: 2 additions & 3 deletions megatron/data/dataset_utils.py
@@ -1,6 +1,5 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based in code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
8 changes: 4 additions & 4 deletions megatron/data/gpt2_dataset.py
@@ -1,6 +1,6 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based in code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
@@ -191,15 +191,15 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
# sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
# num_epochs, tokens_per_epoch)
np.save(sample_idx_filename, sample_idx, allow_pickle=True)
print_rank_0(' > elasped time to build and save sample-idx mapping '
print_rank_0(' > elapsed time to build and save sample-idx mapping '
'(seconds): {:4f}'.format(time.time() - start_time))
# shuffle-idx.
start_time = time.time()
# -1 is due to data structure used to retrieve the index:
# sample i --> [sample_idx[i], sample_idx[i+1])
shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
print_rank_0(' > elasped time to build and save shuffle-idx mapping'
print_rank_0(' > elapsed time to build and save shuffle-idx mapping'
' (seconds): {:4f}'.format(time.time() - start_time))

# This should be a barrier but nccl barrier assumes
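The "-1" in that hunk follows from the boundary convention spelled out in the comment. A simplified 1-D illustration (the real `sample_idx` is 2-D and also tracks document offsets, which this sketch omits):

```python
# Simplified illustration of the convention noted above:
# sample i --> [sample_idx[i], sample_idx[i+1]), so N+1 boundaries
# describe N samples -- hence the -1 when sizing shuffle_idx.
import numpy as np

sample_idx = np.array([0, 7, 13, 20])          # boundaries for 3 samples
num_samples = sample_idx.shape[0] - 1
shuffle_idx = np.random.permutation(num_samples)
for i in shuffle_idx:
    print(f"sample {i}: positions [{sample_idx[i]}, {sample_idx[i + 1]})")
```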
3 changes: 2 additions & 1 deletion megatron/global_vars.py
@@ -1,6 +1,7 @@
# coding=utf-8
# Copyright 2021 Josh Levy-Kramer <[email protected]> et al.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
