
Merge remote-tracking branch 'origin/main' into model_sampling
sdtblck committed Mar 25, 2021
2 parents d770333 + cd0f0b0 commit 78b8526
Showing 35 changed files with 821 additions and 456 deletions.
197 changes: 150 additions & 47 deletions README.md

Large diffs are not rendered by default.

470 changes: 363 additions & 107 deletions configs/README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion configs/eleutherai_cluster.yml
@@ -8,4 +8,4 @@
"tensorboard-dir": "/mnt/ssd-cluster/tensorboard",
"log-dir": "/mnt/ssd-cluster/logs",
"wandb_team": "eleutherai",
}
}
26 changes: 19 additions & 7 deletions deepy.py
@@ -1,6 +1,5 @@
#!/usr/bin/env python

# Copyright 2021 (c) Josh Levy-Kramer <[email protected]>. All rights reserved.
# Copyright (c) 2021, EleutherAI contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,11 +16,11 @@
from socket import gethostname

import shortuuid
import sys
import os
import deepspeed
from deepspeed.launcher.runner import main
import requests
import subprocess

from megatron.config_monster import ConfigMonster
import logging
@@ -30,6 +29,7 @@

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


def get_wandb_api_key():
""" Get Weights and Biases API key from ENV or .netrc file. Otherwise return None """
if 'WANDB_API_KEY' in os.environ:
@@ -40,10 +40,22 @@ def get_wandb_api_key():
if wandb_token is not None:
return wandb_token[1]


def get_git_commit_hash():
""" Gets the git commit hash of your current repo (if it exists) """
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except subprocess.CalledProcessError:
git_hash = None
return git_hash


# Generate unique run group name
wandb_group = shortuuid.uuid()
default_conf = {
'wandb_group': wandb_group
extra_conf = {
'wandb_group': wandb_group,
'git_hash': get_git_commit_hash()
}

# Extract wandb API key and inject into worker environments
@@ -52,12 +64,12 @@ def get_wandb_api_key():
deepspeed.launcher.runner.EXPORT_ENVS.append('WANDB_API_KEY')
os.environ['WANDB_API_KEY'] = wandb_token

old_style_args, conf = ConfigMonster().consume_args(default_conf=default_conf)
old_style_args, conf = ConfigMonster().consume_args(default_conf=extra_conf)

if 'log-dir' in conf:
os.makedirs(conf['log-dir'], exist_ok=True)
file_prefix = os.path.join(conf['log-dir'], '0-deepy')
Tee(file_prefix+'_stdout.txt', err=False)
Tee(file_prefix + '_stdout.txt', err=False)
Tee(file_prefix + '_stderr.txt', err=True)

if 'save' in conf:
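For readers following the deepy.py changes, here is a minimal standalone sketch of the new git-hash capture; the ConfigMonster plumbing is elided, and the usage at the bottom is illustrative:

```python
# Minimal sketch of the git-hash capture added to deepy.py above. It
# assumes `git` is on PATH; outside a repo, `git describe` exits non-zero
# and the helper returns None, matching the try/except in the diff.
import subprocess

def get_git_commit_hash():
    """Return the short hash of the current repo's HEAD, or None."""
    try:
        return subprocess.check_output(
            ["git", "describe", "--always"]
        ).strip().decode()
    except subprocess.CalledProcessError:
        return None

# The hash rides along in the run config, so each W&B run records the
# exact code version it was launched from.
extra_conf = {"git_hash": get_git_commit_hash()}
print(extra_conf)  # e.g. {'git_hash': '78b8526'}
```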
2 changes: 0 additions & 2 deletions examples/pretrain_gpt2_eleutherai_cluster.sh

This file was deleted.

2 changes: 0 additions & 2 deletions examples/pretrain_gpt2_local.sh

This file was deleted.

5 changes: 0 additions & 5 deletions examples/text_generation_gpt2.sh

This file was deleted.

69 changes: 12 additions & 57 deletions megatron/arguments.py
@@ -1,6 +1,6 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
@@ -46,7 +46,6 @@ def _get_parser(extra_args_provider=None):
parser = _add_validation_args(parser)
parser = _add_data_args(parser)
parser = _add_autoresume_args(parser)
parser = _add_realm_args(parser)
parser = _add_zero_args(parser)
parser = _add_activation_checkpoint_args(parser)
parser = _add_text_generate_args(parser)
@@ -141,9 +140,6 @@ def parse_args(extra_args_provider=None, defaults={},
assert args.num_unique_layers <= args.num_layers
assert args.num_layers % args.num_unique_layers == 0, \
'num-layers should be divisible by num-unique-layers.'
if args.num_unique_layers < args.num_layers:
assert args.DDP_impl == 'local', \
'torch-DDP does not work with parameters sharing.'
# Mixed precision checks.
if args.fp16_lm_cross_entropy:
assert args.fp16, 'lm cross entropy in fp16 only supported in fp16 mode.'
@@ -198,7 +194,7 @@ def _add_network_size_args(parser):
'layers 1 and 2: '
' grouped: [1, 2, 1, 2] and spaced: [1, 1, 2, 2].')
group.add_argument('--hidden-size', type=int, default=None,
help='Tansformer hidden size.')
help='Transformer hidden size.')
group.add_argument('--num-attention-heads', type=int, default=None,
help='Number of transformer attention heads.')
group.add_argument('--max-position-embeddings', type=int, default=None,
@@ -437,13 +433,15 @@ def _add_distributed_args(parser):
help='Size of the model parallel.')
group.add_argument('--pipe-parallel-size', type=int, default=0,
help='Size of the pipeline parallel. Disable with 0.')
group.add_argument('--pipe-partition-method', type=str, default='type:transformer',
help='method used to distribute model layers across pipeline stages. Choose from "parameters", '
'which balances the number of parameters on each pipeline stage, "uniform", which naively '
'balances the number of layers per stage, or "type:[regex]" (in our instance this will '
'basically only be type:transformer), which balances layers whose class names match [regex]'
)
group.add_argument('--distributed-backend', default='nccl',
choices=['nccl', 'gloo', 'mpi'],
help='Which backend to use for distributed training.')
group.add_argument('--DDP-impl', default='local',
choices=['local', 'torch'],
help='which DistributedDataParallel implementation '
'to use.')
group.add_argument('--local_rank', type=int, default=None,
help='local rank passed from distributed launcher.')
group.add_argument('--lazy-mpu-init', type=bool, required=False,
@@ -458,14 +456,12 @@ def _add_distributed_args(parser):

def _add_validation_args(parser):
group = parser.add_argument_group(title='validation')

group.add_argument('--eval-iters', type=int, default=100,
help='Number of iterations to run for evaluation '
'validation/test for.')
group.add_argument('--eval-interval', type=int, default=1000,
help='Interval between running evaluation on '
'validation set.')

return parser


@@ -485,8 +481,6 @@ def _add_data_args(parser):
help='Path to the BPE merge file.')
group.add_argument('--seq-length', type=int, default=None,
help="Maximum sequence length to process.")
group.add_argument('--mask-prob', type=float, default=0.15,
help='Probability of replacing a token with mask.')
group.add_argument('--short-seq-prob', type=float, default=0.1,
help='Probability of producing a short sequence.')
group.add_argument('--mmap-warmup', action='store_true',
@@ -501,13 +495,13 @@ def _add_data_args(parser):
choices=['lazy', 'cached', 'mmap', 'infer'],
help='Implementation of indexed datasets.')
group.add_argument('--reset-position-ids', action='store_true',
help='Reset posistion ids after end-of-document token.')
help='Reset position ids after end-of-document token.')
group.add_argument('--reset-attention-mask', action='store_true',
help='Reset self attention maske after '
help='Reset self attention mask after '
'end-of-document token.')
group.add_argument('--eod-mask-loss', action='store_true',
help='Mask loss for the end of document tokens.')
group.add_argument('--log-dir', type=str, help='Directory to store logs.')
group.add_argument('--log-dir', type=str, help='Directory to store logs.', default='./logs')

return parser

@@ -524,45 +518,6 @@ def _add_autoresume_args(parser):
return parser


def _add_realm_args(parser):
group = parser.add_argument_group(title='realm')

# network size
group.add_argument('--ict-head-size', type=int, default=None,
help='Size of block embeddings to be used in ICT and REALM (paper default: 128)')

# checkpointing
group.add_argument('--ict-load', type=str, default=None,
help='Directory containing an ICTBertModel checkpoint')
group.add_argument('--bert-load', type=str, default=None,
help='Directory containing an BertModel checkpoint (needed to start ICT and REALM)')

# data
group.add_argument('--titles-data-path', type=str, default=None,
help='Path to titles dataset used for ICT')
group.add_argument('--query-in-block-prob', type=float, default=0.1,
help='Probability of keeping query in block for ICT dataset')
group.add_argument('--use-one-sent-docs', action='store_true',
help='Whether to use one sentence documents in ICT')

# training
group.add_argument('--report-topk-accuracies', nargs='+', default=[],
help="Which top-k accuracies to report (e.g. '1 5 20')")

# faiss index
group.add_argument('--faiss-use-gpu', action='store_true',
help='Whether create the FaissMIPSIndex on GPU')
group.add_argument('--block-data-path', type=str, default=None,
help='Where to save/load BlockData to/from')

# indexer
group.add_argument('--indexer-batch-size', type=int, default=128,
help='How large of batches to use when doing indexing jobs')
group.add_argument('--indexer-log-interval', type=int, default=1000,
help='After how many batches should the indexer report progress')
return parser


def _add_zero_args(parser):
"""Text generate arguments."""

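The `--pipe-partition-method` help text added above is easiest to see in action. A toy sketch of how a `type:[regex]` method classifies layers by class name follows; the layer classes are illustrative stand-ins, and case-insensitive matching is an assumption rather than something this diff specifies:

```python
# Illustrative sketch of "type:[regex]" partitioning: only layers whose
# class name matches the regex count toward balancing pipeline stages.
# Layer classes are stand-ins; case-insensitive matching is an assumption.
import re

class EmbeddingPipe: ...
class ParallelTransformerLayerPipe: ...

layers = [EmbeddingPipe()] + [ParallelTransformerLayerPipe() for _ in range(4)]
method = "type:transformer"
pattern = re.compile(method.split(":", 1)[1], re.IGNORECASE)
counted = [1 if pattern.search(type(layer).__name__) else 0 for layer in layers]
print(counted)  # [0, 1, 1, 1, 1] -> stages balance over the 4 matching layers
```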
44 changes: 2 additions & 42 deletions megatron/checkpointing.py
@@ -1,6 +1,6 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
@@ -332,43 +332,3 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
print(' successfully loaded {}'.format(checkpoint_name))

return iteration


def load_ict_checkpoint(model, only_query_model=False, only_block_model=False, from_realm_chkpt=False):
"""selectively load ICT models for indexing/retrieving from ICT or REALM checkpoints"""

args = get_args()

if isinstance(model, torchDDP):
model = model.module

load_path = args.load if from_realm_chkpt else args.ict_load

tracker_filename = get_checkpoint_tracker_filename(load_path)
with open(tracker_filename, 'r') as f:
iteration = int(f.read().strip())

# assert iteration > 0
checkpoint_name = get_checkpoint_name(load_path, iteration, False)
if mpu.get_data_parallel_rank() == 0:
print('global rank {} is loading checkpoint {}'.format(
torch.distributed.get_rank(), checkpoint_name))

state_dict = torch.load(checkpoint_name, map_location='cpu')
ict_state_dict = state_dict['model']
if from_realm_chkpt and mpu.get_data_parallel_rank() == 0:
print(" loading ICT state dict from REALM", flush=True)
ict_state_dict = ict_state_dict['retriever']['ict_model']

if only_query_model:
ict_state_dict.pop('context_model')
if only_block_model:
ict_state_dict.pop('question_model')

model.load_state_dict(ict_state_dict)
torch.distributed.barrier()

if mpu.get_data_parallel_rank() == 0:
print(' successfully loaded {}'.format(checkpoint_name))

return model
30 changes: 26 additions & 4 deletions megatron/config_monster.py
@@ -1,4 +1,16 @@
# Copyright 2021 (c) Josh Levy-Kramer <[email protected]>. All rights reserved.
# Copyright (c) 2021, EleutherAI contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
@@ -9,6 +21,7 @@

from megatron.utils import obtain_resource_pool
from megatron.arguments import _get_parser
import torch

log = logging.getLogger('ConfigMonster')

@@ -25,7 +38,7 @@ def _get_megatron_keys(_megatron_keys_exclude):


ds_runner_keys = ['hostfile', 'include', 'exclude', 'num_nodes', 'num_gpus', 'master_port', 'master_addr', 'launcher',
'launcher_args'] # handle separately: 'user_script', 'user_args'
'launcher_args', 'detect_nvlink_pairs'] # handle separately: 'user_script', 'user_args'

megatron_keys_exclude = [
'fp16', # Duplicated in ds_config
@@ -46,7 +59,7 @@ def _get_megatron_keys(_megatron_keys_exclude):
'dump_state', 'flops_profiler', 'activation_checkpointing', 'sparse_attention',
'zero_allow_untested_optimizer', ]

neox_config_keys = ['wandb_group', 'wandb_team']
neox_config_keys = ['wandb_group', 'wandb_team', 'git_hash']

ds_runner_keys_exclude = []

@@ -267,6 +280,11 @@ def parse_args(parser: argparse.ArgumentParser, args=None, extra_conf=None, defa
if args.conf_dir:
conf_files = [os.path.join(args.conf_dir, f) for f in conf_files]

# enables us to pass in `small` instead of `small.yml`
conf_files = [cf if cf.endswith('.yml') else cf + '.yml' for cf in conf_files]

# Load and merge all configuration
conf = {} if extra_conf is None else extra_conf
for path in conf_files:
@@ -313,7 +331,11 @@ def derive_params_and_split(conf):
hostfile_path = conf.get('hostfile', DLTS_HOSTFILE)
resources = obtain_resource_pool(hostfile_path, conf.get('include', ''), conf.get('exclude', ''))
num_gpus = sum(map(len, resources.values()))
log.info(f"Total number of GPUs determined to be: {num_gpus}")
else:
num_gpus = torch.cuda.device_count()
conf["num_gpus"] = num_gpus

log.info(f"Total number of GPUs determined to be: {num_gpus}")

# get world size in the model/pipe parallel case, the actual `world size` deepspeed uses is the size of the
# data-parallel group, or (num_gpus / mp_size) / pp_size
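A quick worked instance of the world-size arithmetic described in that comment (the numbers are arbitrary):

```python
# Worked example of the derivation above: the world size DeepSpeed
# actually uses is the data-parallel group size,
# (num_gpus / mp_size) / pp_size.
num_gpus = 16  # total GPUs across all nodes
mp_size = 2    # model-parallel degree
pp_size = 4    # pipeline-parallel degree
dp_world_size = (num_gpus // mp_size) // pp_size
print(dp_world_size)  # 2 data-parallel replicas
```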
5 changes: 2 additions & 3 deletions megatron/data/dataset_utils.py
@@ -1,6 +1,5 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based in code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
8 changes: 4 additions & 4 deletions megatron/data/gpt2_dataset.py
@@ -1,6 +1,6 @@
# coding=utf-8
#
# Copyright 2021 Biderman et al. This file is based in code by the authors denoted below and has been modified from its original version.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
@@ -191,15 +191,15 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
# sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
# num_epochs, tokens_per_epoch)
np.save(sample_idx_filename, sample_idx, allow_pickle=True)
print_rank_0(' > elasped time to build and save sample-idx mapping '
print_rank_0(' > elapsed time to build and save sample-idx mapping '
'(seconds): {:4f}'.format(time.time() - start_time))
# shuffle-idx.
start_time = time.time()
# -1 is due to data structure used to retrieve the index:
# sample i --> [sample_idx[i], sample_idx[i+1])
shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
print_rank_0(' > elasped time to build and save shuffle-idx mapping'
print_rank_0(' > elapsed time to build and save shuffle-idx mapping'
' (seconds): {:4f}'.format(time.time() - start_time))

# This should be a barrier but nccl barrier assumes
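The "-1" in that hunk follows from the boundary convention spelled out in the comment. A simplified 1-D illustration (the real `sample_idx` is 2-D and also tracks document offsets, which this sketch omits):

```python
# Simplified illustration of the convention noted above:
# sample i --> [sample_idx[i], sample_idx[i+1]), so N+1 boundaries
# describe N samples -- hence the -1 when sizing shuffle_idx.
import numpy as np

sample_idx = np.array([0, 7, 13, 20])          # boundaries for 3 samples
num_samples = sample_idx.shape[0] - 1
shuffle_idx = np.random.permutation(num_samples)
for i in shuffle_idx:
    print(f"sample {i}: positions [{sample_idx[i]}, {sample_idx[i + 1]})")
```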
3 changes: 2 additions & 1 deletion megatron/global_vars.py
@@ -1,6 +1,7 @@
# coding=utf-8
# Copyright 2021 Josh Levy-Kramer <[email protected]> et al.
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
