diff --git a/Dockerfile b/Dockerfile index 179dad119..d3dc6517c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,9 +71,12 @@ RUN mkdir -p /home/mchorse/.ssh /job && \ #### Python packages RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge - COPY requirements/requirements.txt . -RUN pip install -r requirements.txt && pip cache purge +COPY requirements/requirements-onebitadam.txt . +COPY requirements/requirements-sparseattention.txt . +RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && pip install -r requirements-sparseattention.txt && pip cache purge + +## Install APEX RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597 # Clear staging diff --git a/deepy.py b/deepy.py index b2f67d49f..d4cb399da 100755 --- a/deepy.py +++ b/deepy.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os + import deepspeed -from deepspeed.launcher.runner import main import requests - -import logging +from deepspeed.launcher.runner import main logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) @@ -43,8 +43,13 @@ def get_wandb_api_key(): neox_args = NeoXArgs.consume_deepy_args() +if neox_args.wandb_group is not None: + # concat the wandb group name with a uid to make sure it's unique + import wandb + neox_args.wandb_group += "_" + wandb.util.generate_id() neox_args.print() deepspeed_main_args = neox_args.get_deepspeed_main_args() + if __name__ == '__main__': main(deepspeed_main_args) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 84be33101..36d24c4b8 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -26,7 +26,6 @@ import numpy as np import torch -from torch.nn.parallel import DistributedDataParallel as torchDDP from glob import glob from megatron import mpu, get_args @@ -34,6 +33,7 @@ from megatron import print_rank_0 from megatron.utils import natural_sort + def check_checkpoint_args(checkpoint_args): """Ensure fixed arguments for a model are the same for the input arguments and the one retreived frm checkpoint.""" @@ -104,9 +104,7 @@ def delete_old_checkpoints(save_dir, n_to_keep): def save_ds_checkpoint(iteration, model, args): """Save a model checkpoint.""" - - sd = {} - sd['iteration'] = iteration + sd = {'iteration': iteration} # rng states. if not args.no_save_rng: sd['random_rng_state'] = random.getstate() @@ -114,15 +112,6 @@ def save_ds_checkpoint(iteration, model, args): sd['torch_rng_state'] = torch.get_rng_state() sd['cuda_rng_state'] = torch.cuda.get_rng_state() sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states() - - if args.pipe_parallel_size == 0: - # megatron model uses state_dict_for_save_checkpointing instead of the standard state_dict - # state_dict is used by deepspeed for module saving so it needs to point to the right function - model.module.state_dict = model.module.state_dict_for_save_checkpoint - else: - # Pipeline parallelism manages its own state_dict. 
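# Illustrative round-trip sketch (not part of the patch): the client_state dict
# built in save_ds_checkpoint above is handed to DeepSpeed's save_checkpoint()
# and comes back as the second return value of load_checkpoint() (see the load
# path later in this file), which is how the iteration count and RNG states
# survive a restart. `model` is assumed to be a DeepSpeed engine; `args` and
# `iteration` come from the surrounding training loop.
import torch

sd = {'iteration': iteration,
      'torch_rng_state': torch.get_rng_state(),
      'cuda_rng_state': torch.cuda.get_rng_state()}
model.save_checkpoint(args.save, client_state=sd)

# ...later, when resuming:
_, client_sd = model.load_checkpoint(args.load,
                                     load_optimizer_states=not args.no_load_optim,
                                     load_lr_scheduler_states=not args.no_load_optim)
iteration = client_sd['iteration']
torch.set_rng_state(client_sd['torch_rng_state'])
torch.cuda.set_rng_state(client_sd['cuda_rng_state'])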
- pass - model.save_checkpoint(args.save, client_state=sd) @@ -152,17 +141,14 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): torch.distributed.barrier() -def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'): +def load_checkpoint(model, optimizer, lr_scheduler): """Load a model checkpoint and return the iteration.""" args = get_args() - load_dir = getattr(args, load_arg) - if isinstance(model, torchDDP): - model = model.module # Read the tracker file and set the iteration. - tracker_filename = get_checkpoint_tracker_filename(load_dir) + tracker_filename = get_checkpoint_tracker_filename(args.load) - # If no tracker file, return iretation zero. + # If no tracker file, return iteration zero. if not os.path.isfile(tracker_filename): print_rank_0('WARNING: could not find the metadata file {} '.format( tracker_filename)) @@ -190,7 +176,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'): if args.deepspeed: load_optim_and_scheduler = not args.no_load_optim # TODO: These should be configured by separate args - checkpoint_name, state_dict = model.load_checkpoint(load_dir, + checkpoint_name, state_dict = model.load_checkpoint(args.load, load_optimizer_states=load_optim_and_scheduler, load_lr_scheduler_states=load_optim_and_scheduler) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py deleted file mode 100644 index 231744bf7..000000000 --- a/megatron/data/dataset_utils.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright (c) 2021, EleutherAI contributors -# This file is based on code by the authors denoted below and has been modified from its original version. -# -# Copyright 2018 The Google AI Language Team Authors, and NVIDIA. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Most of the code here has been copied from: -# https://github.com/google-research/albert/blob/master/create_pretraining_data.py -# with some modifications. - -import time -import collections - -import numpy as np -from megatron import get_args, print_rank_0 -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - -DSET_TYPE_STD = 'standard_bert' -DSET_TYPE_ICT = 'ict' - -DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD] - - -def compile_helper(): - """Compile helper function ar runtime. Make sure this - is invoked on a single process.""" - import os - import subprocess - path = os.path.abspath(os.path.dirname(__file__)) - ret = subprocess.run(['make', '-C', path]) - if ret.returncode != 0: - print("Making C++ dataset helpers module failed, exiting.") - import sys - sys.exit(1) - - -def get_a_and_b_segments(sample, np_rng): - """Divide sample into a and b segments.""" - - # Number of sentences in the sample. - n_sentences = len(sample) - # Make sure we always have two sentences. - assert n_sentences > 1, 'make sure each sample has at least two sentences.' - - # First part: - # `a_end` is how many sentences go into the `A`. - a_end = 1 - if n_sentences >= 3: - # Note that randin in numpy is exclusive. 
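# Illustrative sketch (not part of the patch) of the A/B split deleted above:
# np_rng.randint's upper bound is exclusive, so a_end is always at least 1 and
# at most n_sentences - 1, guaranteeing segment B is never empty; half the time
# the two segments are swapped to create a "random next" pair.
import numpy as np

np_rng = np.random.RandomState(seed=0)
sample = [[1, 2], [3, 4], [5, 6], [7, 8]]    # four "sentences" of token ids
a_end = np_rng.randint(1, len(sample))       # one of 1, 2, 3 -- never 4
tokens_a = [t for s in sample[:a_end] for t in s]
tokens_b = [t for s in sample[a_end:] for t in s]
is_next_random = np_rng.random() < 0.5       # 50%: swap A and B
if is_next_random:
    tokens_a, tokens_b = tokens_b, tokens_a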
- a_end = np_rng.randint(1, n_sentences) - tokens_a = [] - for j in range(a_end): - tokens_a.extend(sample[j]) - - # Second part: - tokens_b = [] - for j in range(a_end, n_sentences): - tokens_b.extend(sample[j]) - - # Random next: - is_next_random = False - if np_rng.random() < 0.5: - is_next_random = True - tokens_a, tokens_b = tokens_b, tokens_a - - return tokens_a, tokens_b, is_next_random - - -def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): - """Truncates a pair of sequences to a maximum sequence length.""" - #print(len_a, len_b, max_num_tokens) - assert len_a > 0 - assert len_b > 0 - if len_a + len_b <= max_num_tokens: - return False - while len_a + len_b > max_num_tokens: - if len_a > len_b: - len_a -= 1 - tokens = tokens_a - else: - len_b -= 1 - tokens = tokens_b - if np_rng.random() < 0.5: - del tokens[0] - else: - tokens.pop() - return True - - -def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): - """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" - - tokens = [] - tokentypes = [] - # [CLS]. - tokens.append(cls_id) - tokentypes.append(0) - # Segment A. - for token in tokens_a: - tokens.append(token) - tokentypes.append(0) - # [SEP]. - tokens.append(sep_id) - tokentypes.append(0) - # Segment B. - for token in tokens_b: - tokens.append(token) - tokentypes.append(1) - # [SEP]. - tokens.append(sep_id) - tokentypes.append(1) - - return tokens, tokentypes - - -MaskedLmInstance = collections.namedtuple("MaskedLmInstance", - ["index", "label"]) - - -def is_start_piece(piece): - """Check if the current word piece is the starting piece (BERT).""" - # When a word has been split into - # WordPieces, the first token does not have any marker and any subsequence - # tokens are prefixed with ##. So whenever we see the ## token, we - # append it to the previous set of word indexes. - return not piece.startswith("##") - - -def create_masked_lm_predictions(tokens, - vocab_id_list, vocab_id_to_token_dict, - masked_lm_prob, - cls_id, sep_id, mask_id, - max_predictions_per_seq, - np_rng, - max_ngrams=3, - do_whole_word_mask=True, - favor_longer_ngram=False, - do_permutation=False): - """Creates the predictions for the masked LM objective. - Note: Tokens here are vocab ids and not text tokens.""" - - cand_indexes = [] - # Note(mingdachen): We create a list for recording if the piece is - # the starting piece of current token, where 1 means true, so that - # on-the-fly whole word masking is possible. - token_boundary = [0] * len(tokens) - - for (i, token) in enumerate(tokens): - if token == cls_id or token == sep_id: - token_boundary[i] = 1 - continue - # Whole Word Masking means that if we mask all of the wordpieces - # corresponding to an original word. - # - # Note that Whole Word Masking does *not* change the training code - # at all -- we still predict each WordPiece independently, softmaxed - # over the entire vocabulary. 
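# Toy illustration (not part of the patch) of the grouping performed just
# below: "##" continuation pieces are appended to the entry of their head
# piece, so whole-word masking later masks every piece of a word together.
# The real code works on vocab ids via vocab_id_to_token_dict; plain strings
# are used here for readability.
tokens = ["[CLS]", "un", "##break", "##able", "is", "[SEP]"]

cand_indexes = []
for i, tok in enumerate(tokens):
    if tok in ("[CLS]", "[SEP]"):
        continue
    if cand_indexes and tok.startswith("##"):   # continuation piece
        cand_indexes[-1].append(i)
    else:                                       # starting piece of a word
        cand_indexes.append([i])

print(cand_indexes)   # [[1, 2, 3], [4]] -> "unbreakable" is masked as a unit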
- if (do_whole_word_mask and len(cand_indexes) >= 1 and - not is_start_piece(vocab_id_to_token_dict[token])): - cand_indexes[-1].append(i) - else: - cand_indexes.append([i]) - if is_start_piece(vocab_id_to_token_dict[token]): - token_boundary[i] = 1 - - output_tokens = list(tokens) - - masked_lm_positions = [] - masked_lm_labels = [] - - if masked_lm_prob == 0: - return (output_tokens, masked_lm_positions, - masked_lm_labels, token_boundary) - - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - - # Note(mingdachen): - # By default, we set the probilities to favor shorter ngram sequences. - ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) - pvals = 1. / np.arange(1, max_ngrams + 1) - pvals /= pvals.sum(keepdims=True) - - if favor_longer_ngram: - pvals = pvals[::-1] - - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) - - np_rng.shuffle(ngram_indexes) - - masked_lms = [] - covered_indexes = set() - for cand_index_set in ngram_indexes: - if len(masked_lms) >= num_to_predict: - break - if not cand_index_set: - continue - # Note(mingdachen): - # Skip current piece if they are covered in lm masking or previous ngrams. - for index_set in cand_index_set[0]: - for index in index_set: - if index in covered_indexes: - continue - - n = np_rng.choice(ngrams[:len(cand_index_set)], - p=pvals[:len(cand_index_set)] / - pvals[:len(cand_index_set)].sum(keepdims=True)) - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # Note(mingdachen): - # Repeatedly looking for a candidate that does not exceed the - # maximum number of predictions by trying shorter ngrams. - while len(masked_lms) + len(index_set) > num_to_predict: - if n == 0: - break - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # If adding a whole-word mask would exceed the maximum number of - # predictions, then just skip this candidate. - if len(masked_lms) + len(index_set) > num_to_predict: - continue - is_any_index_covered = False - for index in index_set: - if index in covered_indexes: - is_any_index_covered = True - break - if is_any_index_covered: - continue - for index in index_set: - covered_indexes.add(index) - - masked_token = None - # 80% of the time, replace with [MASK] - if np_rng.random() < 0.8: - masked_token = mask_id - else: - # 10% of the time, keep original - if np_rng.random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word - else: - masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] - - output_tokens[index] = masked_token - - masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) - assert len(masked_lms) <= num_to_predict - - np_rng.shuffle(ngram_indexes) - - select_indexes = set() - if do_permutation: - for cand_index_set in ngram_indexes: - if len(select_indexes) >= num_to_predict: - break - if not cand_index_set: - continue - # Note(mingdachen): - # Skip current piece if they are covered in lm masking or previous ngrams. 
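# Quick numeric check (illustrative, not part of the patch) of the ngram
# length distribution used in this function: pvals is proportional to 1/n, so
# shorter spans are chosen most often unless favor_longer_ngram reverses the
# order.
import numpy as np

max_ngrams = 3
ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
pvals = 1. / np.arange(1, max_ngrams + 1)
pvals /= pvals.sum(keepdims=True)
print(dict(zip(ngrams.tolist(), pvals.round(3).tolist())))
# {1: 0.545, 2: 0.273, 3: 0.182}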
- for index_set in cand_index_set[0]: - for index in index_set: - if index in covered_indexes or index in select_indexes: - continue - - n = np.random.choice(ngrams[:len(cand_index_set)], - p=pvals[:len(cand_index_set)] / - pvals[:len(cand_index_set)].sum(keepdims=True)) - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - - while len(select_indexes) + len(index_set) > num_to_predict: - if n == 0: - break - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # If adding a whole-word mask would exceed the maximum number of - # predictions, then just skip this candidate. - if len(select_indexes) + len(index_set) > num_to_predict: - continue - is_any_index_covered = False - for index in index_set: - if index in covered_indexes or index in select_indexes: - is_any_index_covered = True - break - if is_any_index_covered: - continue - for index in index_set: - select_indexes.add(index) - assert len(select_indexes) <= num_to_predict - - select_indexes = sorted(select_indexes) - permute_indexes = list(select_indexes) - np_rng.shuffle(permute_indexes) - orig_token = list(output_tokens) - - for src_i, tgt_i in zip(select_indexes, permute_indexes): - output_tokens[src_i] = orig_token[tgt_i] - masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) - - masked_lms = sorted(masked_lms, key=lambda x: x.index) - - for p in masked_lms: - masked_lm_positions.append(p.index) - masked_lm_labels.append(p.label) - - return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - - -def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, - masked_labels, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - assert len(tokentypes) == num_tokens - assert len(masked_positions) == len(masked_labels) - - # Tokens and token types. - filler = [pad_id] * padding_length - tokens_np = np.array(tokens + filler, dtype=np.int64) - tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) - - # Padding mask. - padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, - dtype=np.int64) - - # Lables and loss mask. 
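# Illustrative sketch (not part of the patch) of the layout produced below:
# real tokens keep their ids and padding gets pad_id; labels are -1 everywhere
# except at masked positions, and loss_mask is 1 only at masked positions, so
# the LM loss is computed over masked tokens alone.
import numpy as np

max_seq_length, pad_id = 8, 0
tokens = [101, 7, 9, 11, 102]                 # 5 real tokens
masked_positions, masked_labels = [2], [42]   # token at index 2 was masked

filler = [pad_id] * (max_seq_length - len(tokens))
tokens_np = np.array(tokens + filler, dtype=np.int64)
padding_mask_np = np.array([1] * len(tokens) + [0] * len(filler), dtype=np.int64)

labels = [-1] * max_seq_length
loss_mask = [0] * max_seq_length
for pos, lab in zip(masked_positions, masked_labels):
    labels[pos], loss_mask[pos] = lab, 1
# tokens_np       -> [101  7  9 11 102  0  0  0]
# padding_mask_np -> [  1  1  1  1   1  0  0  0]
# labels          -> [ -1 -1 42 -1  -1 -1 -1 -1]
# loss_mask       -> [  0  0  1  0   0  0  0  0]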
- labels = [-1] * max_seq_length - loss_mask = [0] * max_seq_length - for i in range(len(masked_positions)): - assert masked_positions[i] < num_tokens - labels[masked_positions[i]] = masked_labels[i] - loss_mask[masked_positions[i]] = 1 - labels_np = np.array(labels, dtype=np.int64) - loss_mask_np = np.array(loss_mask, dtype=np.int64) - - return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np - - -def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): - - print_rank_0(' > building dataset index ...') - - start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, - data_impl, - skip_warmup) - assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - - print_rank_0(' > indexed dataset stats:') - print_rank_0(' number of documents: {}'.format( - indexed_dataset.doc_idx.shape[0] - 1)) - print_rank_0(' number of sentences: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -def get_train_valid_test_split_(splits_string, size): - """ Get dataset splits from comma or '/' separated string list.""" - - splits = [] - if splits_string.find(',') != -1: - splits = [float(s) for s in splits_string.split(',')] - elif splits_string.find('/') != -1: - splits = [float(s) for s in splits_string.split('/')] - else: - splits = [float(splits_string)] - while len(splits) < 3: - splits.append(0.) - splits = splits[:3] - splits_sum = sum(splits) - assert splits_sum > 0.0 - splits = [split / splits_sum for split in splits] - splits_index = [0] - for index, split in enumerate(splits): - splits_index.append(splits_index[index] + - int(round(split * float(size)))) - diff = splits_index[-1] - size - for index in range(1, len(splits_index)): - splits_index[index] -= diff - assert len(splits_index) == 4 - assert splits_index[-1] == size - return splits_index - - diff --git a/megatron/data/gpt2_dataset.py b/megatron/data/gpt2_dataset.py index 3b654be95..d1cef21e7 100644 --- a/megatron/data/gpt2_dataset.py +++ b/megatron/data/gpt2_dataset.py @@ -25,7 +25,6 @@ import torch from megatron import mpu, print_rank_0 -from megatron.data.dataset_utils import get_train_valid_test_split_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset @@ -111,7 +110,6 @@ def __len__(self): # sample i --> [sample_idx[i], sample_idx[i+1]) return self.sample_idx.shape[0] - 1 - def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] @@ -182,7 +180,6 @@ def _build_index_mappings(name, data_prefix, documents, sizes, start_time = time.time() # Use C++ implementation for speed. # First compile and then import. - from megatron.data.dataset_utils import compile_helper compile_helper() from megatron.data import helpers assert doc_idx.dtype == np.int32 @@ -319,3 +316,44 @@ def _build_shuffle_idx(size, np_rng): shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx) return shuffle_idx + + +def compile_helper(): + """Compile helper function ar runtime. 
Make sure this + is invoked on a single process.""" + import os + import subprocess + path = os.path.abspath(os.path.dirname(__file__)) + ret = subprocess.run(['make', '-C', path]) + if ret.returncode != 0: + print("Making C++ dataset helpers module failed, exiting.") + import sys + sys.exit(1) + + +def get_train_valid_test_split_(splits_string, size): + """ Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(',') != -1: + splits = [float(s) for s in splits_string.split(',')] + elif splits_string.find('/') != -1: + splits = [float(s) for s in splits_string.split('/')] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.) + splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 125106623..fa3b9bb78 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -10,14 +10,15 @@ # Added document index to index file and made it accessible. # An empty sentence no longer separates documents. -from functools import lru_cache import os import shutil import struct +from functools import lru_cache from itertools import accumulate import numpy as np import torch + from megatron import print_rank_0 @@ -28,10 +29,6 @@ def __best_fitting_dtype(vocab_size=None): return np.int32 -def get_available_dataset_impl(): - return ['lazy', 'cached', 'mmap'] - - def infer_dataset_impl(path): if IndexedDataset.exists(path): with open(index_file_path(path), 'rb') as f: @@ -200,7 +197,7 @@ def size(self, index): @staticmethod def exists(path): return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) ) @property @@ -532,7 +529,7 @@ def supports_prefetch(self): @staticmethod def exists(path): return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) ) diff --git a/megatron/fp16/__init__.py b/megatron/fp16/__init__.py index 56ee11f79..6e3cd9bc1 100644 --- a/megatron/fp16/__init__.py +++ b/megatron/fp16/__init__.py @@ -12,19 +12,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .fp16util import ( - BN_convert_float, - network_to_half, - prep_param_lists, - model_grads_to_master_grads, - master_params_to_model_params, - tofp16, - to_python_float, - clip_grad_norm, - convert_module, - convert_network, - FP16Model, -) from .fp16 import * -from .loss_scaler import * diff --git a/megatron/fp16/fp16.py b/megatron/fp16/fp16.py index bdea6adbb..512621c28 100755 --- a/megatron/fp16/fp16.py +++ b/megatron/fp16/fp16.py @@ -14,18 +14,8 @@ # limitations under the License. 
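# Worked example (illustrative, not part of the patch) for the
# get_train_valid_test_split_ helper moved into gpt2_dataset.py in the hunk
# above: the weights are normalised, scaled by the document count, and any
# rounding drift is absorbed so the last index always equals `size`.
from megatron.data.gpt2_dataset import get_train_valid_test_split_

print(get_train_valid_test_split_("969,30,1", size=1000))
# [0, 969, 999, 1000] -> train docs [0, 969), valid [969, 999), test [999, 1000)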
"""Stable version of apex FP16 Optimizer""" import torch -from torch import nn from torch.autograd import Variable from torch.nn.parameter import Parameter -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from .loss_scaler import DynamicLossScaler, LossScaler -from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron.module import MegatronModule FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) @@ -64,588 +54,3 @@ def float_conversion(val): return val return conversion_helper(val, float_conversion) - -class FP16_Module(MegatronModule): - def __init__(self, module): - super(FP16_Module, self).__init__() - self.add_module('module', module.half()) - - def forward(self, *inputs, **kwargs): - return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) - - def state_dict(self, destination=None, prefix='', keep_vars=False): - return self.module.state_dict(destination, prefix, keep_vars) - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - return self.module.state_dict_for_save_checkpoint(destination, prefix, - keep_vars) - - def load_state_dict(self, state_dict, strict=True): - self.module.load_state_dict(state_dict, strict=strict) - -# TODO: Update overflow check + downscale to use Carl's fused kernel. - - -class FP16_Optimizer(object): - """ - :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, - and manage static or dynamic loss scaling and master weights in a manner transparent to the user. - For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, - and changing the call to ``backward``. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - # Name the FP16_Optimizer instance to replace the existing optimizer - # (recommended but not required): - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - # loss.backward() becomes: - optimizer.backward(loss) - ... - - Example with dynamic loss scaling:: - - ... - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - # optional arg to control dynamic loss scaling behavior - # dynamic_loss_args={'scale_window' : 500}) - # Usually, dynamic_loss_args is not necessary. - - Args: - init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. - static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. - dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. - dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. 
Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. - verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. - - ``init_optimizer`` is expected to have been constructed in the ordinary way. - It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be - named to replace ``init_optimizer``, for two reasons: - First, it means that references to the same name - later in the file will not have to change. - Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to - modify ``init_optimizer``. If you do choose a unique name for the new - :class:`FP16_Optimizer` instance, you should only work with this new instance, - because the preexisting optimizer might no longer behave as expected. - - ``init_optimizer`` may be any Pytorch optimizer. - It may contain a mixture of fp16 and fp32 parameters organized into any number of - ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will - ingest these ``param_groups`` and remember them. - - Calls to :: - - loss.backward() - - must be replaced with :: - - optimizer.backward(loss) - - because :class:`FP16_Optimizer` requires ownership of the backward pass to implement - loss scaling and copies to master gradients. - - .. note:: - Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients - are downscaled before being applied. This means that adjusting the loss scale, or using - dynamic loss scaling, should not require retuning the learning rate or any other - hyperparameters. - - - **Advanced options** - - **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. - See docstring for :attr:`step`. - - **Gradient clipping**: Use :attr:`clip_master_grads`. - - **Multiple losses**: If your model accumulates gradients from multiple losses, - this can be made more efficient by supplying ``update_master_grads=False`` - to :attr:`backward`. See docstring for :attr:`backward`. - - **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: - - print(optimizer.loss_scale) - optimizer.loss_scale = new_loss_scale - - For static loss scaling, manually adjusting the loss scale over time is a reasonable - thing to do. During later epochs, gradients may become smaller, and a - higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss - scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting - the loss scale is not recommended. - - **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in - Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` - should still work as intended. 
- """ - - def __init__(self, - init_optimizer, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=False): - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - - self.verbose = verbose - - self.optimizer = init_optimizer - # init_state_dict sets up an alternative way to cast per-param state tensors. - # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. - # init_state_dict = init_optimizer.state_dict() - - self.fp16_groups = [] - self.fp32_from_fp16_groups = [] - self.fp32_from_fp32_groups = [] - for i, param_group in enumerate(self.optimizer.param_groups): - self.maybe_print("FP16_Optimizer processing param group {}:".format(i)) - fp16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_fp16_params_this_group = [] - for i, param in enumerate(param_group['params']): - if param.requires_grad: - if param.type() == 'torch.cuda.HalfTensor': - self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" - .format(param.size())) - fp16_params_this_group.append(param) - master_param = param.detach().clone().float() - master_param.requires_grad = True - # Copythe model parallel flag. - master_param.model_parallel = param.model_parallel - param_group['params'][i] = master_param - fp32_from_fp16_params_this_group.append(master_param) - # Reset existing state dict key to the new master param. - # We still need to recast per-param state tensors, if any, to FP32. - if param in self.optimizer.state: - self.optimizer.state[master_param] = self.optimizer.state.pop(param) - elif param.type() == 'torch.cuda.FloatTensor': - self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" - .format(param.size())) - fp32_params_this_group.append(param) - param_group['params'][i] = param - else: - raise TypeError("Wrapped parameters must be either " - "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " - "Received {}".format(param.type())) - - self.fp16_groups.append(fp16_params_this_group) - self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) - self.fp32_from_fp32_groups.append(fp32_params_this_group) - - # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors - self.optimizer.load_state_dict(self.optimizer.state_dict()) - # alternative way to cast per-param state tensors: - # self.optimizer.load_state_dict(init_state_dict) - - if dynamic_loss_scale: - self.dynamic_loss_scale = True - if dynamic_loss_args is not None: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - else: - self.loss_scaler = DynamicLossScaler() - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(static_loss_scale) - - self.overflow = False - self.first_closure_call_this_step = True - - self.clip_grad_norm = clip_grad_norm - - def maybe_print(self, msg): - if self.verbose: - print(msg) - - def __getstate__(self): - raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") - - def __setstate__(self, state): - raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") - - def zero_grad(self, set_grads_to_None=False): - """ - Zero fp32 and fp16 parameter grads. - """ - # In principle, only the .grad attributes of the model params need to be zeroed, - # because gradients are copied into the FP32 master params. 
However, we zero - # all gradients owned by the optimizer, just to be safe: - for group in self.optimizer.param_groups: - for p in group['params']: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - # Zero fp16 gradients owned by the model: - for fp16_group in self.fp16_groups: - for param in fp16_group: - if set_grads_to_None: - param.grad = None - else: - if param.grad is not None: - param.grad.detach_() # as in torch.optim.optimizer.zero_grad() - param.grad.zero_() - - def _check_overflow(self): - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - for group in self.fp32_from_fp32_groups: - for param in group: - params.append(param) - self.overflow = self.loss_scaler.has_overflow(params) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - def _master_params_to_model_params(self): - for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp16_group, fp32_from_fp16_group) - - def _model_params_to_master_params(self): - for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp32_from_fp16_group, fp16_group) - - # To consider: Integrate distributed with this wrapper by registering a hook on each variable - # that does the overflow check, gradient copy + downscale, and fp32 - # allreduce in a different stream. - def _model_grads_to_master_grads(self): - for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) - - def _downscale_master(self): - if self.loss_scale != 1.0: - for group in self.optimizer.param_groups: - grads = [p.grad for p in group['params'] if p.grad is not None] - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [grads, grads], - 1./self.loss_scale) - - def clip_master_grads(self, max_norm, norm_type=2): - """ - Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. - - Args: - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the current fp32 gradients (viewed as a single vector). - - .. warning:: - Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). - """ - if not self.overflow: - fp32_params = [] - for param_group in self.optimizer.param_groups: - for param in param_group['params']: - fp32_params.append(param) - return self.clip_grad_norm(fp32_params, max_norm, norm_type) - else: - return -1 - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
- Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step - state_dict['optimizer_state_dict'] = self.optimizer.state_dict() - state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups - return state_dict - - def load_state_dict(self, state_dict): - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict['loss_scaler'] - self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] - self.overflow = state_dict['overflow'] - self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] - self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 2. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - for current_group, saved_group in zip( - self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): - for current, saved in zip(current_group, saved_group): - current.data.copy_(saved.data) - - def step(self, closure=None): # could add clip option. - """ - If no closure is supplied, :attr:`step` should be called after - ``fp16_optimizer_obj.backward(loss)``. - :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to - :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params - originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run - another forward pass using their model. - - If a closure is supplied, :attr:`step` may be called without a prior call to - :attr:`backward(loss)`. - This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
- However, the user should take care that any ``loss.backward()`` call within the closure - has been replaced by ``fp16_optimizer_obj.backward(loss)``. - - Args: - closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. - - Example with closure:: - - # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an - # existing pytorch optimizer. - for input, target in dataset: - def closure(): - optimizer.zero_grad() - output = model(input) - loss = loss_fn(output, target) - # loss.backward() becomes: - optimizer.backward(loss) - return loss - optimizer.step(closure) - - .. warning:: - Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. - - .. _`ordinary Pytorch optimizer use`: - http://pytorch.org/docs/master/optim.html#optimizer-step-closure - """ - - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - - if self.overflow: - self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}" - .format(scale, self.loss_scale)) - return - - if closure is not None: - retval = self._step_with_closure(closure) - else: - retval = self.optimizer.step() - - self._master_params_to_model_params() - - return retval - - def _step_with_closure(self, closure): - def wrapped_closure(): - # helpful for debugging - # print("Calling wrapped_closure, first_closure_call_this_step = {}" - # .format(self.first_closure_call_this_step)) - if self.first_closure_call_this_step: - # We expect that the fp16 params are initially fresh on entering self.step(), - # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() - # is called within self.optimizer.step(). - self.first_closure_call_this_step = False - else: - # If self.optimizer.step() internally calls wrapped_closure more than once, - # it may update the fp32 params after each call. However, self.optimizer - # doesn't know about the fp16 params at all. If the fp32 params get updated, - # we can't rely on self.optimizer to refresh the fp16 params. We need - # to handle that manually: - self._master_params_to_model_params() - # Our API expects the user to give us ownership of the backward() call by - # replacing all calls to loss.backward() with optimizer.backward(loss). - # This requirement holds whether or not the call to backward() is made within a closure. - # If the user is properly calling optimizer.backward(loss) within "closure," - # calling closure() here will give the fp32 master params fresh gradients - # for the optimizer to play with, so all wrapped_closure needs to do is call - # closure() and return the loss. - temp_loss = closure() - while(self.overflow): - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, " - "reducing to {}".format(scale, self.loss_scale)) - temp_loss = closure() - return temp_loss - - retval = self.optimizer.step(wrapped_closure) - - self.first_closure_call_this_step = True - - return retval - - def backward(self, loss, update_master_grads=True, retain_graph=False): - """ - :attr:`backward` performs the following conceptual steps: - - 1. fp32_loss = loss.float() (see first Note below) - 2. scaled_loss = fp32_loss*loss_scale - 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). - 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. - 5. Finally, master grads are divided by loss_scale. - - In this way, after :attr:`backward`, the master params have fresh gradients, - and :attr:`step` may be called. - - .. note:: - :attr:`backward` internally converts the loss to fp32 before applying the loss scale. - This provides some additional safety against overflow if the user has supplied an - fp16 loss value. - However, for maximum overflow safety, the user should - compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to - :attr:`backward`. - - .. warning:: - The gradients found in a model's leaves after the call to - :attr:`backward` should not be regarded as valid in general, - because it's possible - they have been scaled (and in the case of dynamic loss scaling, - the scale factor may change over time). - If the user wants to inspect gradients after a call to :attr:`backward`, - only the master gradients should be regarded as valid. These can be retrieved via - :attr:`inspect_master_grad_data()`. - - Args: - loss: The loss output by the user's model. loss may be either float or half (but see first Note above). - update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. - retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). - - Example:: - - # Ordinary operation: - optimizer.backward(loss) - - # Naive operation with multiple losses (technically valid, but less efficient): - # fp32 grads will be correct after the second call, but - # the first call incurs an unnecessary fp16->fp32 grad copy. - optimizer.backward(loss1) - optimizer.backward(loss2) - - # More efficient way to handle multiple losses: - # The fp16->fp32 grad copy is delayed until fp16 grads from all - # losses have been accumulated. - optimizer.backward(loss1, update_master_grads=False) - optimizer.backward(loss2, update_master_grads=False) - optimizer.update_master_grads() - """ - # To consider: try multiple backward passes using retain_grad=True to find - # a loss scale that works. After you find a loss scale that works, do a final dummy - # backward pass with retain_graph=False to tear down the graph. Doing this would avoid - # discarding the iteration, but probably wouldn't improve overall efficiency. - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - if update_master_grads: - self.update_master_grads() - - def update_master_grads(self): - """ - Copy the ``.grad`` attribute from stored references to fp16 parameters to - the ``.grad`` attribute of the fp32 master parameters that are directly - updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if - ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. - """ - if self.dynamic_loss_scale: - self._check_overflow() - if self.overflow: - return - self._model_grads_to_master_grads() - self._downscale_master() - - def inspect_master_grad_data(self): - """ - When running with :class:`FP16_Optimizer`, - ``.grad`` attributes of a model's fp16 leaves should not be - regarded as truthful, because they might be scaled. - After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, - the fp32 master params' ``.grad`` - attributes will contain valid gradients properly divided by the loss scale. However, - because :class:`FP16_Optimizer` flattens some parameters, accessing them may be - nonintuitive. :attr:`inspect_master_grad_data` - allows those gradients to be viewed with shapes corresponding to their associated model leaves. - - Returns: - List of lists (one list for each parameter group). The list for each parameter group - is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. - """ - if self.overflow: - print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. " - "Gradients are currently invalid (may be inf, nan, or stale). Returning None.") - return None - else: - # The optimizer owns only references to master params. - master_grads_data = [] - for param_group in self.optimizer.param_groups: - master_grads_this_group = [] - for param in param_group['params']: - if param.grad is not None: - master_grads_this_group.append(param.grad.data) - else: - master_grads_this_group.append(None) - master_grads_data.append(master_grads_this_group) - return master_grads_data - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) diff --git a/megatron/fp16/fp16util.py b/megatron/fp16/fp16util.py deleted file mode 100644 index 0266ede34..000000000 --- a/megatron/fp16/fp16util.py +++ /dev/null @@ -1,216 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
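# Hedged sketch (not part of the patch) pulling together the FP16_Optimizer
# workflow documented in the fp16.py docstrings above: the only changes versus
# ordinary training are wrapping the optimizer and replacing loss.backward()
# with optimizer.backward(loss). The model size, clip value and `loader` are
# illustrative assumptions.
import torch
from megatron.fp16 import FP16_Optimizer

model = torch.nn.Linear(1024, 1024).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

for x, y in loader:                      # assumed to yield fp16 CUDA tensors
    optimizer.zero_grad()
    # compute the criterion in fp32, as the backward() docstring recommends
    loss = torch.nn.functional.mse_loss(model(x).float(), y.float())
    optimizer.backward(loss)             # scales, backprops, copies fp16 -> fp32 grads
    optimizer.clip_master_grads(1.0)     # returns -1 if this step overflowed
    optimizer.step()                     # skipped internally on overflow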
- -import torch -import torch.nn as nn -from torch.autograd import Variable -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron import mpu - - -class tofp16(nn.Module): - """ - Utility module that implements:: - - def forward(self, input): - return input.half() - """ - - def __init__(self): - super(tofp16, self).__init__() - - def forward(self, input): - return input.half() - - -def BN_convert_float(module): - """ - Utility function for network_to_half(). - - Retained for legacy purposes. - """ - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - module.float() - for child in module.children(): - BN_convert_float(child) - return module - - -def network_to_half(network): - """ - Convert model to half precision in a batchnorm-safe way. - - Retained for legacy purposes. It is recommended to use FP16Model. - """ - return nn.Sequential(tofp16(), BN_convert_float(network.half())) - - -def convert_module(module, dtype): - """ - Converts a module's immediate parameters and buffers to dtype. - """ - for param in module.parameters(recurse=False): - if param is not None: - if param.data.dtype.is_floating_point: - param.data = param.data.to(dtype=dtype) - if param._grad is not None and param._grad.data.dtype.is_floating_point: - param._grad.data = param._grad.data.to(dtype=dtype) - - for buf in module.buffers(recurse=False): - if buf is not None and buf.data.dtype.is_floating_point: - buf.data = buf.data.to(dtype=dtype) - - -def convert_network(network, dtype): - """ - Converts a network's parameters and buffers to dtype. - """ - for module in network.modules(): - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - continue - convert_module(module, dtype) - return network - - -class FP16Model(nn.Module): - """ - Convert model to half precision in a batchnorm-safe way. - """ - - def __init__(self, network): - super(FP16Model, self).__init__() - self.network = convert_network(network, dtype=torch.half) - - def forward(self, *inputs): - inputs = tuple(t.half() for t in inputs) - return self.network(*inputs) - - -def backwards_debug_hook(grad): - raise RuntimeError("master_params recieved a gradient in the backward pass!") - - -def prep_param_lists(model, flat_master=False): - """ - Creates a list of FP32 master parameters for a given model, as in - `Training Neural Networks with Mixed Precision: Real Examples`_. - - Args: - model (torch.nn.Module): Existing Pytorch model - flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. - Returns: - A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. - - Example:: - - model_params, master_params = prep_param_lists(model) - - .. warning:: - Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. - - .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: - http://on-demand.gputechconf.com/gtc/2018/video/S81012/ - """ - model_params = [param for param in model.parameters() if param.requires_grad] - - if flat_master: - # Give the user some more useful error messages - try: - # flatten_dense_tensors returns a contiguous flat array. - # http://pytorch.org/docs/master/_modules/torch/_utils.html - master_params = _flatten_dense_tensors([param.data for param in model_params]).float() - except BaseException: - print("Error in prep_param_lists: model may contain a mixture of parameters " - "of different types. Use flat_master=False, or use F16_Optimizer.") - raise - master_params = torch.nn.Parameter(master_params) - master_params.requires_grad = True - # master_params.register_hook(backwards_debug_hook) - if master_params.grad is None: - master_params.grad = master_params.new(*master_params.size()) - return model_params, [master_params] - else: - master_params = [param.clone().float().detach() for param in model_params] - for param in master_params: - param.requires_grad = True - return model_params, master_params - - -def model_grads_to_master_grads(model_params, master_params, flat_master=False): - """ - Copy model gradients to master gradients. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. - """ - if flat_master: - # The flattening may incur one more deep copy than is necessary. - master_params[0].grad.data.copy_( - _flatten_dense_tensors([p.grad.data for p in model_params])) - else: - for model, master in zip(model_params, master_params): - if model.grad is not None: - if master.grad is None: - master.grad = Variable(master.data.new(*master.data.size())) - else: - master.grad = None - model_grads = [p.grad for p in model_params if p.grad is not None] - master_grads = [p.grad for p in master_params if p.grad is not None] - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [model_grads, master_grads], - 1.0) - - -def master_params_to_model_params(model_params, master_params, flat_master=False): - """ - Copy master parameters to model parameters. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. 
- """ - if flat_master: - for model, master in zip(model_params, - _unflatten_dense_tensors(master_params[0].data, model_params)): - model.data.copy_(master) - else: - for model, master in zip(model_params, master_params): - model.data.copy_(master.data) - -# Backward compatibility fixes - - -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) - -clip_grad_norm = mpu.clip_grad_norm -# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4: -# clip_grad_norm = torch.nn.utils.clip_grad_norm -# else: -# clip_grad_norm = torch.nn.utils.clip_grad_norm_ diff --git a/megatron/fp16/loss_scaler.py b/megatron/fp16/loss_scaler.py deleted file mode 100755 index 126b7863f..000000000 --- a/megatron/fp16/loss_scaler.py +++ /dev/null @@ -1,256 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron import mpu - -# item() is a recent addition, so this helps with backward compatibility. - - -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -class LossScaler: - """ - Class that manages a static loss scale. This class is intended to interact with - :class:`FP16_Optimizer`, and should not be directly manipulated by the user. - - Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to - :class:`FP16_Optimizer`'s constructor. - - Args: - scale (float, optional, default=1.0): The loss scale. - """ - - def __init__(self, scale=1): - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [grad_in, grad_in], - self.loss_scale) - return grad_in - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -class DynamicLossScaler: - """ - Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` - indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of - :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` - operates, because the default options can be changed using the - the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. - - Loss scaling is designed to combat the problem of underflowing gradients encountered at long - times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss - scale. 
Ironically, this may result in OVERflowing gradients. If overflowing gradients are - encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has - occurred. - :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, - and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients detected, - :class:`DynamicLossScaler` increases the loss scale once more. - In this way :class:`DynamicLossScaler` attempts to "ride the edge" of - always using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` - scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. - scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. - """ - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000, - min_scale=1, - delayed_shift=1, - consecutive_hysteresis=False): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - self.min_scale = min_scale - self.delayed_shift = delayed_shift - self.cur_hysteresis = delayed_shift - self.consecutive_hysteresis = consecutive_hysteresis - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params): - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow(self, params): - overflow = self.has_overflow_serial(params) - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - - def _has_inf_or_nan(x): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. 
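# A compact sketch of the scale-update rule this docstring describes, ignoring the
# delayed_shift / consecutive_hysteresis options: halve the scale on overflow, and
# multiply it back up after every `scale_window` clean iterations. Illustrative only,
# not the class's full implementation.
def update_scale(cur_scale, cur_iter, last_overflow_iter, overflow,
                 scale_factor=2.0, scale_window=1000, min_scale=1.0):
    if overflow:
        cur_scale = max(cur_scale / scale_factor, min_scale)
        last_overflow_iter = cur_iter
    elif (cur_iter - last_overflow_iter) % scale_window == 0:
        cur_scale *= scale_factor
    return cur_scale, last_overflow_iter

scale, last = 2.0 ** 32, -1
for step in range(2500):
    overflow = step < 3                     # pretend the first few steps overflow
    scale, last = update_scale(scale, step, last, overflow)
print(scale)                                # climbs back up once per clean scale_window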
- if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether the gradient overflowed - def update_scale(self, overflow): - - if not hasattr(self, 'min_scale'): - self.min_scale = 1 - if not hasattr(self, 'delayed_shift'): - self.delayed_shift = 1 - if not hasattr(self, 'cur_hysteresis'): - self.cur_hysteresis = 1 - if not hasattr(self, 'consecutive_hysteresis'): - self.consecutive_hysteresis = True - if overflow: - # self.cur_scale /= self.scale_factor - if self.delayed_shift == 1 or self.cur_hysteresis == 1: - self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale) - else: - self.cur_hysteresis -= 1 - self.last_overflow_iter = self.cur_iter - else: - if self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: - if not self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [grad_in, grad_in], - self.loss_scale) - return grad_in - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -""" -TO-DO separate out into an example. -if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. - N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. - x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. 
/ loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - -""" diff --git a/megatron/global_vars.py b/megatron/global_vars.py index 560d6c778..ba5f23887 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -34,6 +34,7 @@ _GLOBAL_TIMERS = None _GLOBAL_USE_WANDB = False + def get_args(): """Return arguments.""" _ensure_var_is_initialized(_GLOBAL_ARGS, 'args') @@ -67,7 +68,7 @@ def get_timers(): def set_global_variables(): """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers.""" from megatron.neox_arguments import NeoXArgs - args = NeoXArgs.consume_megatron_args() + args = NeoXArgs.consume_neox_args() global _GLOBAL_ARGS _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args') @@ -118,7 +119,7 @@ def _set_tensorboard_writer(args): 'tensorboard writer') if hasattr(args, 'tensorboard_dir') and \ - args.tensorboard_dir and args.rank == 0: + args.tensorboard_dir and args.rank == 0: try: from torch.utils.tensorboard import SummaryWriter print('> setting tensorboard ...') @@ -251,10 +252,12 @@ def log(self, names, normalizer=1.0, reset=True): else: print(string, flush=True) + def get_use_wandb(): global _GLOBAL_USE_WANDB return _GLOBAL_USE_WANDB + def set_use_wandb(b: bool): global _GLOBAL_USE_WANDB - _GLOBAL_USE_WANDB = b \ No newline at end of file + _GLOBAL_USE_WANDB = b diff --git a/megatron/gradient_noise_scale/gradient_noise_scale.py b/megatron/gradient_noise_scale/gradient_noise_scale.py index 35acdefd0..6f4e94b9a 100644 --- a/megatron/gradient_noise_scale/gradient_noise_scale.py +++ b/megatron/gradient_noise_scale/gradient_noise_scale.py @@ -74,7 +74,7 @@ def flatten_grads(self): return torch.cat(grads) def _sync_overflow(self, is_overflow): - if self.args.pipe_parallel_size > 1: + if self.args.is_pipe_parallel: # Since each model parallel GPU carries only part of the model, # make sure overflow flag is synced across all the pipe parallel GPUs overflow_gpu = torch.cuda.ByteTensor([is_overflow]) @@ -104,7 +104,7 @@ def _update(self): # calculate Gbig and Gsmall # this needs to be done in fp32 or it overflows - if self.args.pipe_parallel_size > 1: + if self.args.is_pipe_parallel: g_big = torch.square(torch.norm(grads.to(torch.float))) g_small = torch.square(torch.norm(grad.to(torch.float))) @@ -151,7 +151,7 @@ def _update(self): self.n_updates += 1 def update(self): - if self.args.pipe_parallel_size > 1: + if self.args.is_pipe_parallel: # update on all ranks self._update() else: diff --git a/megatron/initialize.py b/megatron/initialize.py index c3d680d15..9891a3388 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -89,9 +89,6 @@ def finish_mpu_init(): # Megatron's MPU is the master. Complete initialization right away. finish_mpu_init() - # Initialize memory buffers. - _initialize_mem_buffs() - # Autoresume. 
_init_autoresume() @@ -127,10 +124,6 @@ def setup_deepspeed_random_and_activation_checkpointing(args): synchronize=args.synchronize_each_layer, profile=args.profile_backward) - mpu.checkpoint = deepspeed.checkpointing.checkpoint - mpu.get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - mpu.model_parallel_cuda_manual_seed = deepspeed.checkpointing.model_parallel_cuda_manual_seed - def _initialize_distributed(): """Initialize torch.distributed and mpu.""" @@ -158,7 +151,7 @@ def _initialize_distributed(): else: args.local_rank = device torch.cuda.set_device(device) - + distributed.init_distributed( dist_backend=args.distributed_backend, auto_mpi_discovery=True, @@ -167,25 +160,22 @@ def _initialize_distributed(): ) # Setup 3D topology. - if args.pipe_parallel_size > 0: - pp = args.pipe_parallel_size - mp = args.model_parallel_size - assert args.world_size % (pp * mp) == 0, f'world_size={args.world_size}, pp={pp}, mp={mp}' - dp = args.world_size // (pp * mp) - - from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology - # this does pipe on the most outside, then data, then model. - # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order. - topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) - - # Offset base seeds for the interior pipeline stages. - # TODO: adjust last stage too once IO is improved. - stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe - if 0 < stage_id < topo.get_dim('pipe') - 1: - offset = args.seed + 1138 - args.seed = offset + (stage_id * mp) - else: - topo = None + pp = args.pipe_parallel_size if args.pipe_parallel_size >= 1 else 1 + mp = args.model_parallel_size if args.model_parallel_size >= 1 else 1 + assert args.world_size % (pp * mp) == 0, f'world_size={args.world_size}, pp={pp}, mp={mp}' + dp = args.world_size // (pp * mp) + + from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology + # this does pipe on the most outside, then data, then model. + # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order. + topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) + + # Offset base seeds for the interior pipeline stages. + # TODO: adjust last stage too once IO is improved. + stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe + if 0 < stage_id < topo.get_dim('pipe') - 1: + offset = args.seed + 1138 + args.seed = offset + (stage_id * mp) # Set the model-parallel / data-parallel communicators. if device_count > 0: @@ -194,10 +184,8 @@ def _initialize_distributed(): else: mpu.initialize_model_parallel(args.model_parallel_size, topology=topo) - # Optional DeepSpeed Activation Checkpointing Features - # - if args.deepspeed and args.deepspeed_activation_checkpointing: - setup_deepspeed_random_and_activation_checkpointing(args) + # Init DeepSpeed Activation Checkpointing Features + setup_deepspeed_random_and_activation_checkpointing(args) def _init_autoresume(): @@ -228,12 +216,3 @@ def _write_args_to_tensorboard(): if writer: for arg in vars(args): writer.add_text(arg, str(getattr(args, arg))) - - -def _initialize_mem_buffs(): - """Initialize manually allocated static memory.""" - args = get_args() - - # Initialize memory for checkpointed activations. 
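# The arithmetic behind the topology block above, as a standalone sketch. The real
# code hands pp/mp/dp to deepspeed's PipeModelDataParallelTopology; here we only
# reproduce the sizing and the interior-stage seed offset for illustration.
def topology_sizes(world_size, pipe_parallel_size, model_parallel_size):
    pp = pipe_parallel_size if pipe_parallel_size >= 1 else 1
    mp = model_parallel_size if model_parallel_size >= 1 else 1
    assert world_size % (pp * mp) == 0, f'world_size={world_size}, pp={pp}, mp={mp}'
    dp = world_size // (pp * mp)
    return pp, mp, dp

def stage_seed(base_seed, stage_id, pp, mp):
    # Interior pipeline stages get offset seeds; the first and last stages keep the base seed.
    if 0 < stage_id < pp - 1:
        return base_seed + 1138 + stage_id * mp
    return base_seed

pp, mp, dp = topology_sizes(world_size=96, pipe_parallel_size=4, model_parallel_size=2)
print(pp, mp, dp)                                        # 4 2 12
print([stage_seed(1234, s, pp, mp) for s in range(pp)])  # [1234, 2374, 2376, 1234]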
- if args.distribute_checkpointed_activations: - mpu.init_checkpointed_activations_memory_buffer() diff --git a/megatron/memory.py b/megatron/memory.py deleted file mode 100644 index be5a117bc..000000000 --- a/megatron/memory.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - - -# A dictionary of all the memory buffers allocated. -_MEM_BUFFS = dict() - - -def allocate_mem_buff(name, numel, dtype, track_usage): - """Allocate a memory buffer.""" - assert name not in _MEM_BUFFS, \ - 'memory buffer {} already allocated.'.format(name) - _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) - return _MEM_BUFFS[name] - - -def get_mem_buff(name): - """Get the memory buffer.""" - return _MEM_BUFFS[name] - - -class MemoryBuffer: - """Contiguous memory buffer. - Allocate a contiguous memory of type `dtype` and size `numel`. It is - used to reduce memory fragmentation. - - Usage: After the allocation, the `_start` index is set tot the first - index of the memory. A memory chunk starting from `_start` index - can be `allocated` for an input tensor, with the elements of the - tensor being coppied. The buffer can be reused by resetting the - `_start` index. - - """ - def __init__(self, name, numel, dtype, track_usage): - if torch.distributed.get_rank() == 0: - element_size = torch.tensor([], dtype=dtype).element_size() - print('> building the {} memory buffer with {} num elements ' - 'and {} dtype ({:.1f} MB)...'.format( - name, numel, dtype, numel*element_size/1024/1024), - flush=True) - self.name = name - self.numel = numel - self.dtype = dtype - self.data = torch.empty(self.numel, - dtype=self.dtype, - device=torch.cuda.current_device(), - requires_grad=False) - - # Index tracking the start of the free memory. - self._start = 0 - - # Values used for tracking usage. - self.track_usage = track_usage - if self.track_usage: - self.in_use_value = 0.0 - self.total_value = 0.0 - - - def reset(self): - """Reset the buffer start index to the beginning of the buffer.""" - self._start = 0 - - - def is_in_use(self): - """Whether the current buffer hold on to any memory.""" - return self._start > 0 - - - def numel_in_use(self): - """Return number of elements in use.""" - return self._start - - - def add(self, tensor): - """Allocate a chunk of memory from the buffer to tensor and copy - the values.""" - assert tensor.dtype == self.dtype, \ - 'Input tensor type {} different from buffer type {}'.format( - tensor.dtype, self.dtype) - # Number of elements of the input tensor. - tensor_numel = torch.numel(tensor) - new_start = self._start + tensor_numel - assert new_start <= self.numel, \ - 'Not enough memory left in the buffer ({} > {})'.format( - tensor_numel, self.numel - self._start) - # New tensor is a view into the memory. 
- new_tensor = self.data[self._start:new_start] - self._start = new_start - new_tensor = new_tensor.view(tensor.shape) - new_tensor.copy_(tensor) - # Return a pointer to the new tensor. - return new_tensor - - - def get_data(self): - """Return the data currently in use.""" - if self.track_usage: - self.in_use_value += float(self._start) - self.total_value += float(self.numel) - return self.data[:self._start] - - - def print_average_usage(self): - """Print memory usage average over time. We would like this value - to be as high as possible.""" - assert self.track_usage, 'You need to enable track usage.' - if torch.distributed.get_rank() == 0: - print(' > usage of {} memory buffer: {:.2f} %'.format( - self.name, self.in_use_value * 100.0 / self.total_value), - flush=True) - - - -class RingMemBuffer: - """A ring of memory buffers.""" - - def __init__(self, name, num_buffers, numel, dtype, track_usage): - self.num_buffers = num_buffers - self.buffers = [ - allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) - for i in range(num_buffers)] - self._index = -1 - - - def get_next_buffer(self): - self._index += 1 - self._index = self._index % self.num_buffers - buff = self.buffers[self._index] - assert not buff.is_in_use(), 'buffer is already in use.' - return buff diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 396210f1f..8d32f6911 100755 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -16,7 +16,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .gpt2_model import GPT2Model, GPT2ModelPipe +from .gpt2_model import GPT2ModelPipe from .utils import get_params_for_weight_decay_optimization -from .language_model import get_language_model -from .norms import RMSNorm, ScaleNorm, LayerNorm diff --git a/megatron/model/fused_bias_dropout.py b/megatron/model/fused_bias_dropout.py new file mode 100644 index 000000000..b3bb8c8f5 --- /dev/null +++ b/megatron/model/fused_bias_dropout.py @@ -0,0 +1,35 @@ +import torch +import torch.nn.functional as F + +# flags required to enable jit fusion kernels +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x, bias, residual, prob): + # type: (Tensor, Tensor, Tensor, float) -> Tensor + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x, bias, residual, prob): + # type: (Tensor, Tensor, Tensor, float) -> Tensor + return bias_dropout_add(x, bias, residual, prob, False) + diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 0101165ff..db0def9f5 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -21,20 +21,17 @@ import torch from megatron import get_args -from megatron.module import MegatronModule from functools import partial -from .language_model import get_language_model -from .utils import init_method_normal -from .utils import 
scaled_init_method_normal -from .norms import LayerNorm, RMSNorm, ScaleNorm +from megatron.model.utils import init_method_normal, scaled_init_method_normal, Lambda +from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm -# Pipeline parallelism from megatron import mpu from megatron.mpu import ParallelRelativePositionBias import megatron.fp16 as fp16 -from megatron.model.transformer import ParallelTransformerLayerPipe, NormPipe, ParallelLinearPipe, ParallelLinear -from .language_model import EmbeddingPipe, parallel_lm_logits +from megatron.model.transformer import ParallelTransformerLayerPipe, NormPipe, ParallelLinearPipe, parallel_lm_logits +from megatron.model.word_embeddings import EmbeddingPipe +# Pipeline parallelism from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec @@ -65,82 +62,7 @@ def cross_entropy(output, labels, _fp16=False): return loss -class GPT2Model(MegatronModule): - """GPT-2 Language model.""" - - def __init__(self, num_tokentypes=0, parallel_output=True, inference=False, get_key_value=True): - super(GPT2Model, self).__init__() - args = get_args() - self.parallel_output = parallel_output - self.weight_tying = not args.no_weight_tying - self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy - - self.inference = inference - self.get_key_value = get_key_value if inference else False - - self.language_model, self._language_model_key = get_language_model( - attention_mask_func=gpt2_attention_mask_func, - num_tokentypes=num_tokentypes, - init_method=init_method_normal(args.init_method_std), - scaled_init_method=scaled_init_method_normal(args.init_method_std, - args.num_layers), - get_key_value=self.get_key_value) - if not self.weight_tying: - self.final_linear = ParallelLinear(self.parallel_output) - - def forward(self, input_ids, position_ids, attention_mask, - layer_past=None, tokentype_ids=None, forward_method_parallel_output=None, labels=None): - - # Language model. - lm_output = self.language_model(input_ids, - position_ids, - attention_mask, - tokentype_ids=tokentype_ids, - layer_past=layer_past) - - if self.get_key_value: - lm_output, presents = lm_output - - # Output. - parallel_output = self.parallel_output - if forward_method_parallel_output is not None: - parallel_output = forward_method_parallel_output - if self.weight_tying: - output = parallel_lm_logits( - lm_output, - self.language_model.embedding.word_embeddings.weight, - parallel_output) - else: - output, bias = self.final_linear(lm_output) - - if self.get_key_value: - output = [output, presents] - - if labels is None: - return output - else: - if self.fp16_lm_cross_entropy: - assert output.dtype == torch.half - loss = mpu.vocab_parallel_cross_entropy(output, labels) - else: - loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) - return loss - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - state_dict_ = {self._language_model_key: self.language_model.state_dict_for_save_checkpoint( - destination, prefix, keep_vars)} - return state_dict_ - - def load_state_dict(self, state_dict, strict=True): - """Customized load.""" - - if self._language_model_key in state_dict: - state_dict = state_dict[self._language_model_key] - self.language_model.load_state_dict(state_dict, strict=strict) - - -class GPT2ModelPipe(PipelineModule, MegatronModule): +class GPT2ModelPipe(PipelineModule, torch.nn.Module): """GPT2Model adapted for pipeline parallelism. 
The largest change is flattening the GPTModel class so we can express it as a @@ -317,3 +239,33 @@ def _logits_helper(embedding, lm_output): ) # so output in training should just be logits # in inference it will be (logits, presents) (assuming get_key_value) is true + + def to_sequential(self): + """ + Transforms the PipelineModule to a plain nn.Sequential module + :return: + """ + layers = [] + from collections import defaultdict + tied_layers = defaultdict(list) + for n, spec in enumerate(self.specs): + if isinstance(spec, TiedLayerSpec): + if spec.key in tied_layers: + # receiver + layers.append(Lambda(lambda x: spec.forward_fn(tied_layers[spec.key][0], x))) + else: + # owner + module = spec.build(log=False) + layers.append(module) + tied_layers[spec.key].append(module) + elif isinstance(spec, LayerSpec): + layers.append(spec.build(log=False)) + else: + # check that it's a lambda function + LAMBDA = lambda:0 + if isinstance(spec, type(LAMBDA)) and spec.__name__ == LAMBDA.__name__: + # we assume it is a lambda function + layers.append(Lambda(spec)) + else: + raise ValueError(f'Layer number {n} ({spec}) Not recognized') + return torch.nn.Sequential(*layers) diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py deleted file mode 100644 index 7dc61f0f9..000000000 --- a/megatron/model/language_model.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# -# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Transformer based language model.""" - -import torch -import torch.nn.functional as F -from einops import rearrange, repeat - -from megatron import get_args -from megatron import mpu -from megatron.module import MegatronModule -from megatron.model.transformer import ParallelTransformer, SinusoidalPositionalEmbedding, Embedding, EmbeddingPipe -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal, scaled_init_method_normal -from megatron.model.utils import identity - - -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, - bias=None): - """LM logits using word embedding weights.""" - # Parallel logits. - input_parallel = mpu.copy_to_model_parallel_region(input_) - - # Matrix multiply. - if bias is None: - logits_parallel = F.linear(input_parallel, word_embeddings_weight) - else: - logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - - # Gather if needed. 
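# The Lambda wrapper (added to megatron/model/utils.py in this diff) is what lets
# to_sequential() above drop bare functions and tied layers into an nn.Sequential.
# A torch-only illustration; the LayerSpec / TiedLayerSpec plumbing is not reproduced,
# and the tied "receiver" here simply closes over the owner module.
import torch

class Lambda(torch.nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        return self.func(x)

embed = torch.nn.Linear(8, 8)           # stands in for the tied embedding "owner"
seq = torch.nn.Sequential(
    embed,
    Lambda(lambda x: torch.relu(x)),    # a bare lambda from the spec list
    Lambda(lambda x: embed(x)),         # "receiver" reuses the owner's parameters
)
print(seq(torch.randn(2, 8)).shape)     # torch.Size([2, 8])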
- if parallel_output: - return logits_parallel - - return mpu.gather_from_model_parallel_region(logits_parallel) - - -def get_language_model(attention_mask_func, num_tokentypes, - init_method=None, scaled_init_method=None, get_key_value=False): - """Build language model and return along with the key to save.""" - args = get_args() - - if init_method is None: - init_method = init_method_normal(args.init_method_std) - - if scaled_init_method is None: - scaled_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers) - - # Language model. - language_model = TransformerLanguageModel( - attention_mask_func=attention_mask_func, - init_method=init_method, - output_layer_init_method=scaled_init_method, - num_tokentypes=num_tokentypes, - get_key_value=get_key_value) - # key used for checkpoints. - language_model_key = 'language_model' - - return language_model, language_model_key - - -class TransformerLanguageModel(MegatronModule): - """Transformer language model. - - Arguments: - transformer_hparams: transformer hyperparameters - attention_mask_func: a function that takes `unmaksed-attention-scores` - with size [b, np, s, s] and an `attention-mask` and will apply - the masking. The function should return a masked score of the - same size [b, np, s, s]. - masked-attention-scores = attention_mask_func( - unmaksed-attention-scores, attention-mask) - vocab_size: vocabulary size - max_sequence_length: maximum size of sequence. This - is used for positional embedding - embedding_dropout_prob: dropout probability for embeddings - num_tokentypes: size of the token-type embeddings. 0 value - will ignore this embedding - """ - - def __init__(self, - attention_mask_func, - init_method, - output_layer_init_method, - num_tokentypes=0, - get_key_value=False): - super(TransformerLanguageModel, self).__init__() - args = get_args() - - self.hidden_size = args.hidden_size - self.num_tokentypes = num_tokentypes - self.init_method = init_method - self.embedding_type = args.pos_emb - # Embeddings - self.embedding = Embedding(self.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - self.init_method, - self.num_tokentypes) - self._embedding_key = 'embedding' - self.get_key_value = get_key_value - - # Transformer - self.transformer = ParallelTransformer( - attention_mask_func, self.init_method, - output_layer_init_method, get_key_value=self.get_key_value) - self._transformer_key = 'transformer' - - def forward(self, input_ids, position_ids, attention_mask, - tokentype_ids=None, layer_past=None): - - # Embeddings. - embedding_output = self.embedding(input_ids, position_ids, - tokentype_ids=tokentype_ids) - # Transformer. - transformer_output = self.transformer(embedding_output, - attention_mask, - layer_past=layer_past) - return transformer_output - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - """For easy load.""" - - state_dict_ = {} - state_dict_[self._embedding_key] \ - = self.embedding.state_dict_for_save_checkpoint( - destination, prefix, keep_vars) - state_dict_[self._transformer_key] \ - = self.transformer.state_dict_for_save_checkpoint( - destination, prefix, keep_vars) - - return state_dict_ - - def load_state_dict(self, state_dict, strict=True): - """Customized load.""" - - # Embedding. - if self._embedding_key in state_dict: - state_dict_ = state_dict[self._embedding_key] - else: - # for backward compatibility. 
- state_dict_ = {} - for key in state_dict.keys(): - if '_embeddings' in key: - state_dict_[key] = state_dict[key] - self.embedding.load_state_dict(state_dict_, strict=strict) - - # Transformer. - if self._transformer_key in state_dict: - state_dict_ = state_dict[self._transformer_key] - else: - # for backward compatibility. - state_dict_ = {} - for key in state_dict.keys(): - if 'transformer.' in key: - state_dict_[key.split('transformer.')[1]] = state_dict[key] - self.transformer.load_state_dict(state_dict_, strict=strict) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 1f47bc312..547c46c60 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -1,8 +1,7 @@ import torch -from megatron.module import MegatronModule -class SinusoidalPositionalEmbedding(MegatronModule): +class SinusoidalPositionalEmbedding(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() @@ -16,7 +15,7 @@ def forward(self, x, seq_dim=1): return emb[None, :, :] -class RotaryEmbedding(MegatronModule): +class RotaryEmbedding(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b351918a1..d8a6cd3b4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -25,13 +25,12 @@ from .norms import LayerNorm, RMSNorm, ScaleNorm from megatron import get_args from megatron import mpu -from megatron.module import MegatronModule from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.utils import openai_gelu, erf_gelu, exists -from megatron.mpu import ParallelRelativePositionBias -from megatron.model.positional_embeddings import SinusoidalPositionalEmbedding, RotaryEmbedding, apply_rotary_pos_emb - +from megatron.model.positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb +from megatron.model.fused_bias_dropout import get_bias_dropout_add, bias_dropout_add_fused_train, \ + bias_dropout_add_fused_inference import deepspeed from deepspeed.ops.sparse_attention import SparseSelfAttention, VariableSparsityConfig @@ -63,7 +62,7 @@ """ -class GEGLU(MegatronModule): +class GEGLU(torch.nn.Module): def __init__(self): super(GEGLU, self).__init__() @@ -91,7 +90,7 @@ def forward(self, x, bias=None): return intermediate_parallel * x -class ParallelMLP(MegatronModule): +class ParallelMLP(torch.nn.Module): """MLP. MLP will take the input with h hidden state, project it to 4*h @@ -157,7 +156,7 @@ def forward(self, hidden_states): return output, output_bias -class ParallelLinear(MegatronModule): +class ParallelLinear(torch.nn.Module): """ A Parallel Linear Layer transforming the transformer outputs from hidden_size -> vocab_size """ @@ -178,7 +177,7 @@ def forward(self, hidden_states): return self.final_linear(hidden_states) -class ParallelSelfAttention(MegatronModule): +class ParallelSelfAttention(torch.nn.Module): """Parallel self-attention layer abstract class. 
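# Usage sketch for the fused bias-dropout-add helpers that this diff moves out of
# megatron/model/transformer.py into megatron/model/fused_bias_dropout.py. The
# pattern is out = residual + dropout(x + bias); the fused variants bake the training
# flag in so torch.jit.script can fuse the whole expression. Redefined inline here
# (with explicit annotations) so the snippet runs on its own.
import torch
import torch.nn.functional as F

def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, residual: torch.Tensor,
                     prob: float, training: bool) -> torch.Tensor:
    return residual + F.dropout(x + bias, p=prob, training=training)

@torch.jit.script
def bias_dropout_add_fused_train(x: torch.Tensor, bias: torch.Tensor,
                                 residual: torch.Tensor, prob: float) -> torch.Tensor:
    return bias_dropout_add(x, bias, residual, prob, True)

x, bias, residual = torch.randn(4, 16), torch.zeros(16), torch.randn(4, 16)
out = bias_dropout_add_fused_train(x, bias, residual, 0.1)
print(out.shape)  # torch.Size([4, 16])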
Self-attention layer takes input with size [b, s, h] @@ -456,33 +455,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None): return output, bias -def bias_dropout_add(x, bias, residual, prob, training): - # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor - out = torch.nn.functional.dropout(x + bias, p=prob, training=training) - out = residual + out - return out - - -def get_bias_dropout_add(training): - def _bias_dropout_add(x, bias, residual, prob): - return bias_dropout_add(x, bias, residual, prob, training) - - return _bias_dropout_add - - -@torch.jit.script -def bias_dropout_add_fused_train(x, bias, residual, prob): - # type: (Tensor, Tensor, Tensor, float) -> Tensor - return bias_dropout_add(x, bias, residual, prob, True) - - -@torch.jit.script -def bias_dropout_add_fused_inference(x, bias, residual, prob): - # type: (Tensor, Tensor, Tensor, float) -> Tensor - return bias_dropout_add(x, bias, residual, prob, False) - - -class ParallelTransformerLayer(MegatronModule): +class ParallelTransformerLayer(torch.nn.Module): """A single transformer layer. Transformer layer takes input with size [b, s, h] and returns an @@ -603,164 +576,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): return output -class ParallelTransformer(MegatronModule): - """Transformer class.""" - - def __init__(self, attention_mask_func, - init_method, output_layer_init_method, get_key_value=False): - super(ParallelTransformer, self).__init__() - args = get_args() - - # Store activation checkpoiting flag. - self.checkpoint_activations = args.checkpoint_activations - self.checkpoint_num_layers = args.checkpoint_num_layers - - self.get_key_value = get_key_value - # Number of layers: - self.num_layers = args.num_layers - self.num_unique_layers = args.num_unique_layers - if self.num_unique_layers is None: - self.num_unique_layers = self.num_layers - assert self.num_layers % self.num_unique_layers == 0, \ - 'number of layers should be divisible by number of unique layers' - self.param_sharing_style = args.param_sharing_style - - if args.pos_emb == 'rpe': - rpe_emb = ParallelRelativePositionBias(causal=True, num_buckets=args.rpe_num_buckets, - max_distance=args.rpe_max_distance, - heads=args.num_attention_heads) - - # Transformer layers. - sparsity = args.sparsity - - def build_layer(layer_number): - if sparsity == 'none': - sparse = False - elif sparsity == 'all': - sparse = True - elif sparsity == 'interspersed': - sparse = not layer_number % 2 == 0 - else: - raise ValueError(f'Sparsity type {sparsity} not recognized') - return ParallelTransformerLayer( - attention_mask_func, init_method, - output_layer_init_method, layer_number, sparse=sparse, - rpe=rpe_emb if args.pos_emb == 'rpe' else None, - get_key_value=get_key_value, - rotary=args.pos_emb == 'rotary') - - self.layers = torch.nn.ModuleList( - [build_layer(i + 1) for i in range(self.num_unique_layers)]) - - # Print layer ordering. - if self.num_layers != self.num_unique_layers: - if torch.distributed.get_rank() == 0: - print('> will be using the following layer ordering:') - for i in range(self.num_layers): - print(' layer id: {:3d} --> unique layer id: ' - '{:3d}'.format(i, self._get_layer_index(i)), - flush=True) - - # Final layer norm before output. 
- if args.norm == "rmsnorm": - norm = RMSNorm - eps = args.rms_norm_epsilon - elif args.norm == "layernorm": - eps = args.layernorm_epsilon - norm = LayerNorm - elif args.norm == "scalenorm": - eps = args.scalenorm_epsilon - norm = ScaleNorm - - self.final_layernorm = norm( - args.hidden_size, - eps=eps) - - if deepspeed.checkpointing.is_configured(): - global get_cuda_rng_tracker, checkpoint - get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - checkpoint = deepspeed.checkpointing.checkpoint - - def _get_layer_index(self, layer_number): - if self.param_sharing_style == 'grouped': - return layer_number % self.num_unique_layers - if self.param_sharing_style == 'spaced': - return layer_number // (self.num_layers // self.num_unique_layers) - assert False, 'should not be here' - - def _get_layer(self, layer_number): - return self.layers[self._get_layer_index(layer_number)] - - def _checkpointed_forward(self, hidden_states, attention_mask): - """Forward method with activation checkpointing.""" - - def custom(start, end): - def custom_forward(*inputs): - x_ = inputs[0] - for index in range(start, end): - layer = self._get_layer(index) - x_ = layer(x_, inputs[1]) - return x_ - - return custom_forward - - # Make sure memory is freed. - mpu.reset_checkpointed_activations_memory_buffer() - l = 0 - while l < self.num_layers: - hidden_states = mpu.checkpoint( - custom(l, l + self.checkpoint_num_layers), - hidden_states, attention_mask) - l += self.checkpoint_num_layers - - return hidden_states - - def forward(self, hidden_states, attention_mask, layer_past=None, ): - # Checks - if layer_past is not None and layer_past.numel() > 0: - assert self.get_key_value, \ - 'for not None values in layer_past, ' \ - 'expected get_key_value to be set' - if self.get_key_value: - assert not self.checkpoint_activations, \ - 'get_key_value does not work with ' \ - 'activation checkpointing' - - # data format change to avoid explicit tranposes : [b s h] --> [s b h] - hidden_states = hidden_states.transpose(0, 1).contiguous() - - if self.checkpoint_activations: - hidden_states = self._checkpointed_forward(hidden_states, - attention_mask) - else: - if self.get_key_value: - presents = torch.Tensor() - for index in range(self.num_layers): - layer = self._get_layer(index) - past = None - if layer_past.numel() > 0: - past = layer_past[index] - hidden_states = layer(hidden_states, - attention_mask, - layer_past=past) - if self.get_key_value: - hidden_states, present = hidden_states - if presents.numel() == 0: - presents = present.unsqueeze(dim=0) - else: - presents = torch.cat((presents, present.unsqueeze(dim=0))) - - # reverting data format change [s b h] --> [b s h] - hidden_states = hidden_states.transpose(0, 1).contiguous() - - # Final layer norm. - output = self.final_layernorm(hidden_states) - if self.get_key_value: - output = [output, presents] - - return output - - class ParallelTransformerLayerPipe(ParallelTransformerLayer): """Extends ParallelTransformerLayer to forward attention_mask through the pipeline. 
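# ParallelLinearPipe and NormPipe (below) share one convention: in training a
# pipeline stage receives a bare hidden-state tensor, while in inference it receives
# a (hidden_states, presents) tuple and must pass `presents` through untouched
# (ParallelTransformerLayerPipe above does the analogous thing with the attention
# mask). A generic torch-only sketch of that convention; the class name is illustrative.
import torch

class PassPresentsThrough(torch.nn.Module):
    def __init__(self, inner: torch.nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, args):
        if not isinstance(args, tuple):
            return self.inner(args)                  # training: tensor in, tensor out
        if len(args) == 2:
            hidden_state, presents = args            # inference: keep presents as-is
            return self.inner(hidden_state), presents
        raise ValueError(f'Incorrect number of arguments for {self.__class__.__name__}')

stage = PassPresentsThrough(torch.nn.LayerNorm(16))
print(stage(torch.randn(2, 16)).shape)                       # training path
out, presents = stage((torch.randn(2, 16), torch.zeros(1)))  # inference path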
""" @@ -794,226 +609,59 @@ def forward(self, args): f'In layer {self.layer_number} - Incorrect number of arguments ({len(args)}) for {self.__class__.__name__}') -class NormPipe(MegatronModule): - """Just a helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" - - def __init__(self, norm_class, hidden_size, eps): - super().__init__() - self.norm = norm_class(hidden_size, eps=eps) +class ParallelLinearPipe(ParallelLinear): + """Another helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" def forward(self, args): if not isinstance(args, tuple): # in training, args = hidden_state (tensor, so we check if object isn't a tuple and pass through here) hidden_state = args - return self.norm(hidden_state) + logits, bias = super().forward(hidden_state) + return logits elif len(args) == 2: - # in inference, args will be (hidden_state, presents) + # we are in inference, so input is (hidden_states, presents) hidden_state, presents = args - hidden_state = self.norm(hidden_state) - return hidden_state, presents + logits, bias = super().forward(hidden_state) + return logits, presents else: raise ValueError(f'Incorrect number of arguments for {self.__class__.__name__}') -class Embedding(MegatronModule): - """Language model embeddings. - Arguments: - hidden_size: hidden size - vocab_size: vocabulary size - max_sequence_length: maximum size of sequence. This - is used for positional embedding - embedding_dropout_prob: dropout probability for embeddings - init_method: weight initialization method - num_tokentypes: size of the token-type embeddings. 0 value - will ignore this embedding - """ - - def __init__(self, - hidden_size, - vocab_size, - max_sequence_length, - embedding_dropout_prob, - init_method, - num_tokentypes=0): - super(Embedding, self).__init__() - args = get_args() - self.hidden_size = hidden_size - self.init_method = init_method - self.num_tokentypes = num_tokentypes - - # Word embeddings (parallel). - self.word_embeddings = mpu.VocabParallelEmbedding( - vocab_size, self.hidden_size, init_method=self.init_method) - self._word_embeddings_key = 'word_embeddings' - - # Position embedding (serial). - self.embedding_type = args.pos_emb - if self.embedding_type == "learned": - self.position_embeddings = torch.nn.Embedding( - max_sequence_length, self.hidden_size) - self._position_embeddings_key = 'position_embeddings' - # Initialize the position embeddings. - self.init_method(self.position_embeddings.weight) - elif self.embedding_type == "sinusoidal": - self.position_embeddings = SinusoidalPositionalEmbedding(self.hidden_size) - - # Token type embedding. - # Add this as an optional field that can be added through - # method call so we can load a pretrain model without - # token types and add them as needed. - self._tokentype_embeddings_key = 'tokentype_embeddings' - if self.num_tokentypes > 0: - self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, - self.hidden_size) - # Initialize the token-type embeddings. - self.init_method(self.tokentype_embeddings.weight) - else: - self.tokentype_embeddings = None - - # Embeddings dropout - self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) - - def add_tokentype_embeddings(self, num_tokentypes): - """Add token-type embedding. This function is provided so we can add - token-type embeddings in case the pretrained model does not have it. - This allows us to load the model normally and then add this embedding. 
- """ - if self.tokentype_embeddings is not None: - raise Exception('tokentype embeddings is already initialized') - if torch.distributed.get_rank() == 0: - print('adding embedding for {} tokentypes'.format(num_tokentypes), - flush=True) - self.num_tokentypes = num_tokentypes - self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, - self.hidden_size) - # Initialize the token-type embeddings. - self.init_method(self.tokentype_embeddings.weight) - - def forward(self, input_ids, position_ids, tokentype_ids=None): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - if self.embedding_type in ["learned", "sinusoidal"]: - position_embeddings = self.position_embeddings(position_ids) - embeddings = words_embeddings + position_embeddings - else: - embeddings = words_embeddings - if tokentype_ids is not None: - assert self.tokentype_embeddings is not None - embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) - else: - assert self.tokentype_embeddings is None - - # Dropout. - embeddings = self.embedding_dropout(embeddings) - return embeddings - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - """For easy load.""" - - state_dict_ = {} - state_dict_[self._word_embeddings_key] \ - = self.word_embeddings.state_dict(destination, prefix, keep_vars) - if self.embedding_type == "learned": - state_dict_[self._position_embeddings_key] \ - = self.position_embeddings.state_dict( - destination, prefix, keep_vars) - if self.num_tokentypes > 0: - state_dict_[self._tokentype_embeddings_key] \ - = self.tokentype_embeddings.state_dict( - destination, prefix, keep_vars) - - return state_dict_ - - def load_state_dict(self, state_dict, strict=True): - """Customized load.""" - - # Word embedding. - if self._word_embeddings_key in state_dict: - state_dict_ = state_dict[self._word_embeddings_key] - else: - # for backward compatibility. - state_dict_ = {} - for key in state_dict.keys(): - if 'word_embeddings' in key: - state_dict_[key.split('word_embeddings.')[1]] \ - = state_dict[key] - self.word_embeddings.load_state_dict(state_dict_, strict=strict) - - # Position embedding. - if self.embedding_type == "learned": - if self._position_embeddings_key in state_dict: - state_dict_ = state_dict[self._position_embeddings_key] - else: - # for backward compatibility. - state_dict_ = {} - for key in state_dict.keys(): - if 'position_embeddings' in key: - state_dict_[key.split('position_embeddings.')[1]] \ - = state_dict[key] - self.position_embeddings.load_state_dict(state_dict_, strict=strict) - - # Tokentype embedding. - if self.num_tokentypes > 0: - state_dict_ = {} - if self._tokentype_embeddings_key in state_dict: - state_dict_ = state_dict[self._tokentype_embeddings_key] - else: - # for backward compatibility. 
- for key in state_dict.keys(): - if 'tokentype_embeddings' in key: - state_dict_[key.split('tokentype_embeddings.')[1]] \ - = state_dict[key] - if len(state_dict_.keys()) > 0: - self.tokentype_embeddings.load_state_dict(state_dict_, - strict=strict) - else: - print('***WARNING*** expected tokentype embeddings in the ' - 'checkpoint but could not find it', flush=True) - +class NormPipe(torch.nn.Module): + """Just a helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" -class ParallelLinearPipe(ParallelLinear): - """Another helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" + def __init__(self, norm_class, hidden_size, eps): + super().__init__() + self.norm = norm_class(hidden_size, eps=eps) def forward(self, args): if not isinstance(args, tuple): # in training, args = hidden_state (tensor, so we check if object isn't a tuple and pass through here) hidden_state = args - logits, bias = super().forward(hidden_state) - return logits + return self.norm(hidden_state) elif len(args) == 2: - # we are in inference, so input is (hidden_states, presents) + # in inference, args will be (hidden_state, presents) hidden_state, presents = args - logits, bias = super().forward(hidden_state) - return logits, presents + hidden_state = self.norm(hidden_state) + return hidden_state, presents else: raise ValueError(f'Incorrect number of arguments for {self.__class__.__name__}') -class EmbeddingPipe(Embedding): - """Extends Embedding to forward attention_mask through the pipeline.""" +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, + bias=None): + """LM logits using word embedding weights.""" + # Parallel logits. + input_parallel = mpu.copy_to_model_parallel_region(input_) - @property - def word_embeddings_weight(self): - """Easy accessory for the pipeline engine to tie embeddings across stages.""" - return self.word_embeddings.weight + # Matrix multiply. + if bias is None: + logits_parallel = F.linear(input_parallel, word_embeddings_weight) + else: + logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - def forward(self, args): - in_inference = len(args) == 4 # if the length of the args is 4, we're in inference :| - in_train = len(args) == 3 - - input_ids = args[0] - position_ids = args[1] - attention_mask = args[2] - if in_inference: - layer_past = args[3] - elif in_train: - pass - else: - raise ValueError(f'Incorrect number of args passed to {self.__class__.__name__}') + # Gather if needed. 
+ if parallel_output: + return logits_parallel - embeddings = super().forward(input_ids, position_ids) - if in_inference: - return embeddings, layer_past, attention_mask - else: - return embeddings, attention_mask + return mpu.gather_from_model_parallel_region(logits_parallel) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 51bfe4abd..0ddcdbbae 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -22,8 +22,6 @@ import torch -from .transformer import LayerNorm, RMSNorm, ScaleNorm - def init_method_normal(sigma): """Init method based on N(0, sigma).""" @@ -44,15 +42,6 @@ def init_(tensor): return init_ -def get_linear_layer(rows, columns, init_method): - """Simple linear layer with weight initialization.""" - layer = torch.nn.Linear(rows, columns) - init_method(layer.weight) - with torch.no_grad(): - layer.bias.zero_() - return layer - - @torch.jit.script def gelu_impl(x): """OpenAI's gelu implementation.""" @@ -76,6 +65,7 @@ def get_params_for_weight_decay_optimization(module, args): """ weight_decay_params = {'params': []} no_weight_decay_params = {'params': [], 'weight_decay': 0.0} + from .transformer import LayerNorm, RMSNorm, ScaleNorm for module_ in module.modules(): if any([isinstance(module_, LayerNorm), isinstance(module_, RMSNorm), isinstance(module_, ScaleNorm)]) or \ (args.weight_decay == 0.0): # also include all parameters here if no weight decay is being done @@ -97,9 +87,15 @@ def get_params_for_weight_decay_optimization(module, args): return weight_decay_params, no_weight_decay_params -def identity(x, *args, **kwargs): - return x - - def exists(x): return x is not None + + +class Lambda(torch.nn.Module): + def __init__(self, func): + super().__init__() + self.func = func + + def forward(self, x): + return self.func(x) + diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py new file mode 100644 index 000000000..70a49806c --- /dev/null +++ b/megatron/model/word_embeddings.py @@ -0,0 +1,128 @@ +import torch + +from megatron import get_args +from megatron import mpu +from megatron.model.positional_embeddings import SinusoidalPositionalEmbedding + + +class Embedding(torch.nn.Module): + """Language model embeddings. + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + init_method, + num_tokentypes=0): + super(Embedding, self).__init__() + args = get_args() + self.hidden_size = hidden_size + self.init_method = init_method + self.num_tokentypes = num_tokentypes + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, self.hidden_size, init_method=self.init_method) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding (serial). + self.embedding_type = args.pos_emb + if self.embedding_type == "learned": + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, self.hidden_size) + self._position_embeddings_key = 'position_embeddings' + # Initialize the position embeddings. 
+ self.init_method(self.position_embeddings.weight) + elif self.embedding_type == "sinusoidal": + self.position_embeddings = SinusoidalPositionalEmbedding(self.hidden_size) + + # Token type embedding. + # Add this as an optional field that can be added through + # method call so we can load a pretrain model without + # token types and add them as needed. + self._tokentype_embeddings_key = 'tokentype_embeddings' + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + + def add_tokentype_embeddings(self, num_tokentypes): + """Add token-type embedding. This function is provided so we can add + token-type embeddings in case the pretrained model does not have it. + This allows us to load the model normally and then add this embedding. + """ + if self.tokentype_embeddings is not None: + raise Exception('tokentype embeddings is already initialized') + if torch.distributed.get_rank() == 0: + print('adding embedding for {} tokentypes'.format(num_tokentypes), + flush=True) + self.num_tokentypes = num_tokentypes + self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + + def forward(self, input_ids, position_ids, tokentype_ids=None): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + if self.embedding_type in ["learned", "sinusoidal"]: + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + else: + embeddings = words_embeddings + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) + else: + assert self.tokentype_embeddings is None + + # Dropout. + embeddings = self.embedding_dropout(embeddings) + return embeddings + + +class EmbeddingPipe(Embedding): + """Extends Embedding to forward attention_mask through the pipeline.""" + + @property + def word_embeddings_weight(self): + """Easy accessory for the pipeline engine to tie embeddings across stages.""" + return self.word_embeddings.weight + + def forward(self, args): + in_inference = len(args) == 4 # if the length of the args is 4, we're in inference :| + in_train = len(args) == 3 + + input_ids = args[0] + position_ids = args[1] + attention_mask = args[2] + if in_inference: + layer_past = args[3] + elif in_train: + pass + else: + raise ValueError(f'Incorrect number of args passed to {self.__class__.__name__}') + + embeddings = super().forward(input_ids, position_ids) + if in_inference: + return embeddings, layer_past, attention_mask + else: + return embeddings, attention_mask + diff --git a/megatron/module.py b/megatron/module.py deleted file mode 100644 index a78c228f9..000000000 --- a/megatron/module.py +++ /dev/null @@ -1,31 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
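# A single-GPU sketch of what parallel_lm_logits (re-homed into transformer.py in
# this diff) computes once the mpu copy/gather regions are stripped away: with tied
# weights, the output logits are just a linear map with the word-embedding matrix
# that EmbeddingPipe.word_embeddings_weight exposes. Sizes below are illustrative.
import torch
import torch.nn.functional as F

vocab_size, hidden_size = 100, 16
word_embeddings = torch.nn.Embedding(vocab_size, hidden_size)

def lm_logits(hidden_states: torch.Tensor) -> torch.Tensor:
    # [batch, seq, hidden] x [vocab, hidden]^T -> [batch, seq, vocab]
    return F.linear(hidden_states, word_embeddings.weight)

hidden = torch.randn(2, 8, hidden_size)
print(lm_logits(hidden).shape)  # torch.Size([2, 8, 100])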
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Megatron Module""" - -import torch - - -class MegatronModule(torch.nn.Module): - """Megatron specific extentions of torch Module.""" - - def __init__(self): - super(MegatronModule, self).__init__() - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - """Use this function to override the state dict for - saving checkpoints.""" - return self.state_dict(destination, prefix, keep_vars) diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py index bed81b74b..419e596e5 100644 --- a/megatron/mpu/__init__.py +++ b/megatron/mpu/__init__.py @@ -19,8 +19,6 @@ from .data import broadcast_data -from .grads import clip_grad_norm - from .initialize import is_unitialized from .initialize import destroy_model_parallel from .initialize import get_data_parallel_group @@ -50,9 +48,7 @@ from .random import checkpoint from .random import get_cuda_rng_tracker -from .random import init_checkpointed_activations_memory_buffer from .random import model_parallel_cuda_manual_seed -from .random import reset_checkpointed_activations_memory_buffer from .utils import divide from .utils import split_tensor_along_last_dim diff --git a/megatron/mpu/grads.py b/megatron/mpu/grads.py deleted file mode 100644 index f1e511ea9..000000000 --- a/megatron/mpu/grads.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Parts of the code here are adapted from PyTorch -# repo: https://github.com/pytorch/pytorch - - -import torch -from torch._six import inf - -try: - from apex.multi_tensor_apply import multi_tensor_applier - import amp_C - -except Exception as e: - print('WARNING: APEX is not installed, multi_tensor_applier will not be available.') - -from .initialize import get_model_parallel_group -from .initialize import get_model_parallel_rank - - -def l2_grad_clipper(parameters, max_norm): - """Efficient L2 norm gradient clipping.""" - - overflow_buf = torch.zeros(1, dtype=torch.int, device='cuda') - # Make sure we have an iterable. - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - # Filter parameters with gradients. - parameters_with_grads = list(filter( - lambda p: p.grad is not None, parameters)) - # Filter parameters for norm calculations. - mp_rank_is_zero = (get_model_parallel_rank() == 0) - parameters_for_norm = list(filter( - lambda p: p.model_parallel or mp_rank_is_zero, parameters_with_grads)) - # Calculate L2 norm. 
- norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - overflow_buf, - [parameters_for_norm], - False # no per-parameter norm - ) - # Sum across all model parallel GPUs. - norm_2 = norm * norm - torch.distributed.all_reduce(norm_2, - op=torch.distributed.ReduceOp.SUM, - group=get_model_parallel_group()) - total_norm = norm_2.item() ** 0.5 - # Scale to get max_norm. - clip_coef = float(max_norm) / (total_norm + 1.0e-6) - grads = [p.grad for p in parameters_with_grads] - if clip_coef < 1.0: - multi_tensor_applier( - amp_C.multi_tensor_scale, - overflow_buf, - [grads, grads], - clip_coef) - return total_norm - - -def clip_grad_norm(parameters, max_norm, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - max_norm = float(max_norm) - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - # Take max across all GPUs. - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) - #elif norm_type == 2: - # total_norm = l2_grad_clipper(parameters, max_norm) - - else: - total_norm = 0 - for p in parameters: - if p.model_parallel or (get_model_parallel_rank() == 0): - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item() ** norm_type - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() ** (1. / norm_type) - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) - return total_norm diff --git a/megatron/mpu/random.py b/megatron/mpu/random.py index 547d90b6b..b3eaecba6 100644 --- a/megatron/mpu/random.py +++ b/megatron/mpu/random.py @@ -1,319 +1,22 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
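The deleted `clip_grad_norm` above is standard global-norm clipping plus a model-parallel all-reduce of the per-rank norms. The single-process sketch below keeps only the core arithmetic (local norm, clip coefficient, in-place scaling); it is illustrative only and omits the distributed reduction and the `model_parallel` ownership filter.

```python
import torch

def clip_grad_norm_local(parameters, max_norm, norm_type=2.0):
    """Single-process version of the clipping arithmetic (no all_reduce, no MP filtering)."""
    grads = [p.grad for p in parameters if p.grad is not None]
    total_norm = torch.norm(torch.stack([g.norm(norm_type) for g in grads]), norm_type)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for g in grads:
            g.mul_(clip_coef)   # gradients are modified in place, as in the original
    return total_norm.item()

w = torch.nn.Parameter(torch.randn(10))
(w ** 2).sum().backward()
print(clip_grad_norm_local([w], max_norm=1.0))   # pre-clip norm; w.grad is now scaled to norm <= 1
```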
- - -# Parts of the code here are adapted from PyTorch -# repo: https://github.com/pytorch/pytorch - -import contextlib - -import torch -from torch import _C -from torch.cuda import _lazy_call, device as device_ctx_manager -from torch.utils.checkpoint import detach_variable - -from megatron import get_args -from megatron.memory import allocate_mem_buff - -from .initialize import get_data_parallel_rank -from .initialize import get_model_parallel_group -from .initialize import get_model_parallel_rank -from .initialize import get_model_parallel_world_size +# mostly moving to using checkpointing from deepspeed (identical code anyway) so currently this file is only imports +# TODO: should be able to get rid of this file entirely +import deepspeed +import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing # Default name for the model parallel rng tracker. -_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' - +_MODEL_PARALLEL_RNG_TRACKER_NAME = deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME # Whether apply model parallelsim to checkpointed hidden states. _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None - -def init_checkpointed_activations_memory_buffer(): - """Initializ the memory buffer for the checkpointed activations.""" - args = get_args() - - per_layer = args.batch_size * args.max_position_embeddings * \ - args.hidden_size // args.model_parallel_size - assert args.num_layers % args.checkpoint_num_layers == 0, \ - 'number of layers is not divisible by checkpoint-num-layers' - num_checkpointer_layers = args.num_layers // args.checkpoint_num_layers - numel = per_layer * num_checkpointer_layers - dtype = torch.half - if not (args.precision == "fp16"): - dtype = torch.float - - global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER - assert _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None, \ - 'checkpointed activations memory buffer is already allocated.' - _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = allocate_mem_buff( - 'checkpointed activations', numel, dtype, track_usage=False) - - -def reset_checkpointed_activations_memory_buffer(): - """Reset the memory used for checkpointing.""" - if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None: - _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.reset() - - -def _set_cuda_rng_state(new_state, device=-1): - """Sets the random number generator state of the current GPU. - - Argumentss: - new_state (torch.ByteTensor): The desired state - This function is adapted from PyTorch repo (torch.cuda.set_rng_state) - with a single change: the input state is not cloned. Cloning caused - major performance issues for +4 GPU cases. 
- """ - if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): - # older PyTorch - def cb(): - with device_ctx_manager(device): - _C._cuda_setRNGState(new_state) - else: - # newer PyTorch - if device == -1: - device = torch.device('cuda') - elif isinstance(device, str): - device = torch.device(device) - elif isinstance(device, int): - device = torch.device('cuda', device) - - def cb(): - idx = device.index - if idx is None: - idx = torch.cuda.current_device() - default_generator = torch.cuda.default_generators[idx] - default_generator.set_state(new_state) - - _lazy_call(cb) - - -def split_tensor_into_1d_equal_chunks(tensor): - """Break a tensor into equal 1D chunks.""" - data = tensor.view(-1) - partition_size = torch.numel(data) // get_model_parallel_world_size() - start_index = partition_size * get_model_parallel_rank() - end_index = start_index + partition_size - return data[start_index:end_index] - - -def gather_split_1d_tensor(tensor): - """Opposite of above function, gather values from model parallel ranks.""" - world_size = get_model_parallel_world_size() - numel = torch.numel(tensor) - numel_gathered = world_size * numel - gathered = torch.empty(numel_gathered, dtype=tensor.dtype, - device=torch.cuda.current_device(), - requires_grad=False) - chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)] - torch.distributed.all_gather(chunks, tensor, - group=get_model_parallel_group()) - return gathered - - -class CudaRNGStatesTracker: - """Tracker for the cuda RNG states. - - Using the `add` method, a cuda rng state is initialized based on - the input `seed` and is assigned to `name`. Later, by forking the - rng state, we can perform operations and return to our starting - cuda state. - """ - - def __init__(self): - # Map from a string name to the cuda rng state. - self.states_ = {} - # Seeds are just for book keeping and ensure no seed is set twice. - self.seeds_ = set() - - def reset(self): - """Set to the initial state (no tracker).""" - self.states_ = {} - self.seeds_ = set() - - def get_states(self): - """Get rng states. Copy the dictionary so we have direct - pointers to the states, not just a pointer to the dictionary.""" - states = {} - for name in self.states_: - states[name] = self.states_[name] - return states - - def set_states(self, states): - """Set the rng states. For efficiency purposes, we do not check - the size of seed for compatibility.""" - self.states_ = states - - def add(self, name, seed): - """Track the rng state.""" - # Check seed is not already used. - if seed in self.seeds_: - raise Exception('seed {} already exists'.format(seed)) - self.seeds_.add(seed) - # Check that state is not already defined. - if name in self.states_: - raise Exception('cuda rng state {} already exists'.format(name)) - # Get the current rng state. - orig_rng_state = torch.cuda.get_rng_state() - # Set the new state and store it. - torch.cuda.manual_seed(seed) - self.states_[name] = torch.cuda.get_rng_state() - # Reset rng state to what it was. - _set_cuda_rng_state(orig_rng_state) - - @contextlib.contextmanager - def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): - """Fork the cuda rng state, perform operations, and exit with - the original state.""" - # Check if we have added the state - if name not in self.states_: - raise Exception('cuda rng state {} is not added'.format(name)) - # Store current rng state. 
- orig_cuda_rng_state = torch.cuda.get_rng_state() - # Set rng state to the desired one - _set_cuda_rng_state(self.states_[name]) - # Do the stuff we wanted to do. - try: - yield - finally: - # Update the current rng state for later use. - self.states_[name] = torch.cuda.get_rng_state() - # And set the state to the original state we started with. - _set_cuda_rng_state(orig_cuda_rng_state) - - # RNG tracker object. -_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() - - -def get_cuda_rng_tracker(): - """Get cuda rng tracker.""" - return _CUDA_RNG_STATE_TRACKER - - -def model_parallel_cuda_manual_seed(seed): - """Initialize model parallel cuda seed. - - This function should be called after the model parallel is - initialized. Also, no torch.cuda.manual_seed should be called - after this function. Basically, this is replacement for that - function. - Two set of RNG states are tracked: - default state: This is for data parallelism and is the same among a - set of model parallel GPUs but different across - different model paralle groups. This is used for - example for dropout in the non-model-parallel regions. - model-parallel state: This state is different among a set of model - parallel GPUs, but the same across data parallel - groups. This is used for example for dropout in - model parallel regions. - """ - # 2718 is just for fun and any POSITIVE value will work. - offset = seed + 2718 - model_parallel_seed = offset + get_model_parallel_rank() - # Data parallel gets the original sedd. - data_parallel_seed = seed - - if torch.distributed.get_rank() == 0: - print('> initializing model parallel cuda seeds on global rank {}, ' - 'model parallel rank {}, and data parallel rank {} with ' - 'model parallel seed: {} and data parallel seed: {}'.format( - torch.distributed.get_rank(), get_model_parallel_rank(), - get_data_parallel_rank(), model_parallel_seed, - data_parallel_seed), flush=True) - _CUDA_RNG_STATE_TRACKER.reset() - # Set the default state. - torch.cuda.manual_seed(data_parallel_seed) - # and model parallel state. - _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, - model_parallel_seed) - - -class CheckpointFunction(torch.autograd.Function): - """This function is adapted from torch.utils.checkpoint with - two main changes: - 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` - 2) the states in the model parallel tracker are also properly - tracked/set/reset. - """ - @staticmethod - def forward(ctx, run_function, *args): - ctx.run_function = run_function - - # Copy the rng states. - ctx.fwd_cpu_rng_state = torch.get_rng_state() - ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() - ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() - - with torch.no_grad(): - outputs = run_function(*args) - - # Divide hidden states across model parallel group and only keep - # the chunk corresponding to the current rank. - if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None: - ctx.input_0_shape = args[0].data.shape - args[0].data = split_tensor_into_1d_equal_chunks(args[0].data) - args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add( - args[0].data) - - # Store everything. 
- ctx.save_for_backward(*args) - - - return outputs - - @staticmethod - def backward(ctx, *args): - if not torch.autograd._is_checkpoint_valid(): - raise RuntimeError("Checkpointing is not compatible with .grad(), " - "please use .backward() if possible") - inputs = ctx.saved_tensors - if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None: - inputs[0].data = gather_split_1d_tensor(inputs[0].data) - inputs[0].data = inputs[0].data.view(ctx.input_0_shape) - - # Store the current states. - bwd_cpu_rng_state = torch.get_rng_state() - bwd_cuda_rng_state = torch.cuda.get_rng_state() - bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() - - # Set the states to what it used to be before the forward pass. - torch.set_rng_state(ctx.fwd_cpu_rng_state) - _set_cuda_rng_state(ctx.fwd_cuda_rng_state) - get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) - - # Compute the forward pass. - detached_inputs = detach_variable(inputs) - with torch.enable_grad(): - outputs = ctx.run_function(*detached_inputs) - - # Set the states back to what it was at the start of this function. - torch.set_rng_state(bwd_cpu_rng_state) - _set_cuda_rng_state(bwd_cuda_rng_state) - get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) - - if isinstance(outputs, torch.Tensor): - outputs = (outputs,) - torch.autograd.backward(outputs, args) - grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp - for inp in detached_inputs) - return (None,) + grads +_CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER +# Deepspeed checkpointing functions +# TODO: replace calls to these in our codebase with calls to the deepspeed ones +_set_cuda_rng_state = checkpointing._set_cuda_rng_state +checkpoint = checkpointing.checkpoint +model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed +get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker -def checkpoint(function, *args): - """Checkpoint a model or part of the model. - This has been directly copied from torch.utils.checkpoint.""" - return CheckpointFunction.apply(function, *args) diff --git a/megatron/neox_arguments/__init__.py b/megatron/neox_arguments/__init__.py index 2cc0152ba..afe9cb571 100644 --- a/megatron/neox_arguments/__init__.py +++ b/megatron/neox_arguments/__init__.py @@ -19,7 +19,7 @@ * NeoXArgs.from_dict({"num_layers": 12, ...}): load attribute values from dict; checks unknown arguments are performed * NeoXArgs.consume_deepy_args(): entry point for deepy.py configuring and consuming command line arguments (i.e. user_script, conf_dir, conf_file, wandb_group, wandb_team); neox_args.get_deepspeed_main_args() produces a list of command line arguments to feed to deepspeed.launcher.runner.main -* NeoXArgs.consume_megatron_args(): In the call stack deepy.py -> deepspeed -> pretrain_gpt2.py; arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(). So produced arguments can be read with consume_megatron_args() to instantiate a NeoXArgs instance. +* NeoXArgs.consume_neox_args(): In the call stack deepy.py -> deepspeed -> pretrain_gpt2.py; arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(). So produced arguments can be read with consume_neox_args() to instantiate a NeoXArgs instance. 
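The replacement above turns `megatron/mpu/random.py` into thin aliases of DeepSpeed's activation-checkpointing module, so the tracker semantics stay the same. As a reminder of what those semantics are, here is a CPU-only toy tracker (not the repo's or DeepSpeed's implementation) showing the `add`/`fork` contract the aliases preserve: `add(name, seed)` snapshots a named RNG state, and `fork(name)` temporarily swaps it in and saves it back on exit.

```python
import contextlib
import torch

class ToyRNGTracker:
    def __init__(self):
        self.states = {}

    def add(self, name, seed):
        orig = torch.get_rng_state()
        torch.manual_seed(seed)
        self.states[name] = torch.get_rng_state()   # snapshot the seeded state
        torch.set_rng_state(orig)                   # leave the default stream untouched

    @contextlib.contextmanager
    def fork(self, name):
        orig = torch.get_rng_state()
        torch.set_rng_state(self.states[name])
        try:
            yield
        finally:
            self.states[name] = torch.get_rng_state()  # remember where this stream left off
            torch.set_rng_state(orig)

tracker = ToyRNGTracker()
tracker.add("model-parallel-rng", seed=1234)
with tracker.fork("model-parallel-rng"):
    print(torch.rand(2))   # drawn from the model-parallel stream
print(torch.rand(2))       # drawn from the default stream, unaffected by the fork
```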
**code structure** diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index f01e75b1f..48cd25e22 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -3,26 +3,20 @@ import json import logging import shortuuid +import copy +import torch +import argparse -import dataclasses from dataclasses import dataclass from typing import List -from pathlib import Path from socket import gethostname from typing import Literal, Dict - -import torch - from deepspeed.launcher.runner import DLTS_HOSTFILE - from megatron.logging import Tee from megatron.utils import obtain_resource_pool - from .deepspeed_args import NeoXArgsDeepspeedConfig, NeoXArgsDeepspeedRunner -from .megatron_args import NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, \ - NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen, NeoXArgsOptimizer, NeoXArgsLRScheduler - -import argparse +from .neox_args import NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, \ + NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen, NeoXArgsOptimizer, NeoXArgsLRScheduler # ZERO defaults by deespeed # These values should not be changed unless defaults in deepspeed are changed @@ -54,22 +48,23 @@ } BASE_CLASSES = [ - NeoXArgsDeepspeedRunner, + NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, - NeoXArgsModel, + NeoXArgsModel, NeoXArgsLRScheduler, NeoXArgsOptimizer, NeoXArgsTokenizer, - NeoXArgsTraining, + NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen - ] +] DEEPSPEED_ARG_CLASSES = [NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig] NEOX_ARG_CLASSES = [i for i in BASE_CLASSES if i not in DEEPSPEED_ARG_CLASSES] + @dataclass class NeoXArgs(*BASE_CLASSES): """ @@ -88,19 +83,19 @@ def __post_init__(self): calculate values, assert consistency and do typechecking. 
""" if not NeoXArgs.validate_keys(): - raise ValueError(self.__class__.__name__+".__post_init__() NeoXArgs keys cannot be validated") + raise ValueError(self.__class__.__name__ + ".__post_init__() NeoXArgs keys cannot be validated") self.enable_logging() self.configure_distributed_args() self.calculate_derived() - + if not self.validate_types(): - raise ValueError(self.__class__.__name__+".__post_init__() NeoXArgs types cannot be validated") + raise ValueError(self.__class__.__name__ + ".__post_init__() NeoXArgs types cannot be validated") if not self.validate_values(): - raise ValueError(self.__class__.__name__+".__post_init__() NeoXArgs values cannot be validated") - + raise ValueError(self.__class__.__name__ + ".__post_init__() NeoXArgs values cannot be validated") + self.save_yml() @classmethod @@ -113,7 +108,7 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) overwrite_values: If provided, overwrite any values in the yamls with these values """ - print(cls.__name__+".from_ymls() "+str(paths_to_yml_files), flush=True) + print(cls.__name__ + ".from_ymls() " + str(paths_to_yml_files), flush=True) # initialize an empty config dictionary to be filled by yamls config = dict() @@ -128,15 +123,18 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) # check for key duplicates and load values for conf_key, conf_value in conf.items(): if conf_key in config: - raise ValueError(f'Conf file {conf_file_name} has the following duplicate keys with previously loaded file: {conf_key}') + raise ValueError( + f'Conf file {conf_file_name} has the following duplicate keys with previously loaded file: {conf_key}') - conf_key_converted = conf_key.replace("-", "_") #TODO remove replace and update configuration files? + conf_key_converted = conf_key.replace("-", "_") # TODO remove replace and update configuration files? config[conf_key_converted] = conf_value # Configuration parameters not specified - params_not_in_config = sorted(list(set(cls.__dataclass_fields__.keys()) - set(config.keys()))) + params_not_in_config = sorted(list(set(cls.__dataclass_fields__.keys()) - set(config.keys()))) if len(params_not_in_config) > 0: - logging.debug(cls.__name__+".from_ymls() Configuration parameters not specified (using defaults): "+", ".join(params_not_in_config)) + logging.debug( + cls.__name__ + ".from_ymls() Configuration parameters not specified (using defaults): " + ", ".join( + params_not_in_config)) if overwrite_values is not None: for k, v in overwrite_values.items(): @@ -170,30 +168,29 @@ def consume_deepy_args(cls): group = parser.add_argument_group(title='Training Configuration') group.add_argument("user_script", - type=str, - help="User script to launch, followed by any required " - "arguments.") + type=str, + help="User script to launch, followed by any required " + "arguments.") group.add_argument("--conf_dir", '-d', - type=str, - default=None, - help="Directory to prefix to all configuration file paths") + type=str, + default=None, + help="Directory to prefix to all configuration file paths") group.add_argument("conf_file", - type=str, - nargs='+', - help="Configuration file path. Multiple files can be provided and will be merged.") - + type=str, + nargs='+', + help="Configuration file path. 
Multiple files can be provided and will be merged.") + group = parser.add_argument_group(title='Weights and Biases monitoring args') group.add_argument('--wandb_group', type=str, default=None, - help='Weights and Biases group name - used to group together "runs".') + help='Weights and Biases group name - used to group together "runs".') group.add_argument('--wandb_team', type=str, default=None, - help='Team name for Weights and Biases.') + help='Team name for Weights and Biases.') args_parsed = parser.parse_args() - # Validate user_script exists assert os.path.exists(args_parsed.user_script), f"User script could not be found: {args_parsed.user_script}" @@ -205,17 +202,22 @@ def consume_deepy_args(cls): # enables us to pass in `small` instead of `small.yml` conf_files = [(cf if cf.endswith('.yml') else cf + ".yml") for cf in conf_files] + # determine overwrite values + overwrite_values = dict() + if args_parsed.wandb_group is not None: + overwrite_values["wandb_group"] = args_parsed.wandb_group + if args_parsed.wandb_team is not None: + overwrite_values["wandb_team"] = args_parsed.wandb_team + if args_parsed.user_script is not None: + overwrite_values["user_script"] = args_parsed.user_script + # load args - neox_args = cls.from_ymls(paths_to_yml_files=conf_files, overwrite_values={ - "wandb_group": args_parsed.wandb_group, - "wandb_team": args_parsed.wandb_team, - "user_script": args_parsed.user_script - }) + neox_args = cls.from_ymls(paths_to_yml_files=conf_files, overwrite_values=overwrite_values) return neox_args @classmethod - def consume_megatron_args(cls): + def consume_neox_args(cls): """ Deepspeed launcher needs to pass the arguments for `pretrain_gpt2.py` across to all machines. @@ -245,7 +247,6 @@ def convert_key_value_to_command_line_arg(k, v): return [] return [f'--{k}', str(v)] - def get_deepspeed_main_args(self): args_list = list() @@ -265,9 +266,9 @@ def get_deepspeed_main_args(self): # get all config values args_list.append("--megatron_config") - megatron_args = self.get_parent_class_value_dict(*self.__class__.__bases__, only_non_defaults=True) - args_list.append(json.dumps(megatron_args)) - + neox_args = self.get_parent_class_value_dict(*self.__class__.__bases__, only_non_defaults=True) + args_list.append(json.dumps(neox_args)) + return args_list ############################################################################################################################ @@ -298,7 +299,7 @@ def get_parent_class_value_dict(self, *parent_classes, only_non_defaults=False) """ takes a sequence of parent classes and returns corresponding values (with defaults set) """ - #TODO no Nones or non-defaults + # TODO no Nones or non-defaults result = dict() for parent in parent_classes: for key, default_value in parent().defaults(): @@ -320,7 +321,7 @@ def params_dtype(self): ############################################################################################################################ # start of logging and output - + def enable_logging(self): """ enable Tee logs based on the configured logdir @@ -329,7 +330,7 @@ def enable_logging(self): os.makedirs(self.log_dir, exist_ok=True) hostname = gethostname() file_prefix = os.path.join(self.log_dir, hostname) - Tee(file_prefix+'_stdout.txt', err=False) + Tee(file_prefix + '_stdout.txt', err=False) Tee(file_prefix + '_stderr.txt', err=True) def save_yml(self): @@ -348,7 +349,6 @@ def print(self): print('-------------------- arguments --------------------', flush=True) str_list = [] for arg in vars(self): - # add arg + value dots = 
'.' * (32 - len(arg)) value = getattr(self, arg) @@ -360,7 +360,6 @@ def print(self): dots = '.' * (64 - len(print_str)) print_str += dots + default_info - str_list.append(print_str) for arg in sorted(str_list, key=lambda x: x.lower()): print(arg, flush=True) @@ -376,14 +375,16 @@ def configure_distributed_args(self): if self.deepspeed_mpi: from deepspeed.utils.distributed import mpi_discovery mpi_discovery() - + self.update_value("local_rank", int(os.getenv('LOCAL_RANK', '0'))) self.update_value("rank", int(os.getenv('RANK', '0'))) self.update_value("world_size", int(os.getenv("WORLD_SIZE", '1'))) self.update_value("model_parallel_size", min(self.model_parallel_size, self.world_size)) if self.rank == 0: - print(self.__class__.__name__+".configure_distributed_args() using world size: {} and model-parallel size: {} ".format(self.world_size, self.model_parallel_size), flush=True) + print( + self.__class__.__name__ + ".configure_distributed_args() using world size: {} and model-parallel size: {} ".format( + self.world_size, self.model_parallel_size), flush=True) @staticmethod def calculate_batch_parameters(dp_world_size, train_batch=None, micro_batch=None, grad_acc=None): @@ -440,8 +441,8 @@ def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc): assert train_batch == micro_batch * grad_acc * dp_world_size, \ (f'Check batch related parameters. train_batch_size is not equal' - ' to micro_batch_per_gpu * gradient_acc_step * world_size' - f'{train_batch} != {micro_batch} * {grad_acc} * {dp_world_size}') + ' to micro_batch_per_gpu * gradient_acc_step * world_size' + f'{train_batch} != {micro_batch} * {grad_acc} * {dp_world_size}') def calculate_derived(self): """ @@ -453,15 +454,12 @@ def calculate_derived(self): if self.wandb_group is None: # if none is defined a uuid is set for the run self.wandb_group = shortuuid.uuid() - else: - # if one is defined it is concatenated with a uuid to make the run unique - self.wandb_group = str(self.wandb_group) + shortuuid.uuid() # number of gpus # Get number of GPUs param or hostfile to determine train_batch_size num_gpus = self.num_gpus if num_gpus is None: - num_gpus = -1 # set -1 for backwards compatibility to old default value + num_gpus = -1 # set -1 for backwards compatibility to old default value if num_gpus < 1: if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE): hostfile_path = self.hostfile or DLTS_HOSTFILE @@ -471,7 +469,8 @@ def calculate_derived(self): num_gpus = torch.cuda.device_count() self.update_value("num_gpus", num_gpus) - logging.info(self.__class__.__name__+".calculate_derived() "+f"Total number of GPUs determined to be: {self.num_gpus}") + logging.info( + self.__class__.__name__ + ".calculate_derived() " + f"Total number of GPUs determined to be: {self.num_gpus}") # get world size in the model/pipe parallel case, the actual `world size` deepspeed uses is the size of the # data-parallel group, or (num_gpus / mp_size) / pp_size @@ -479,25 +478,26 @@ def calculate_derived(self): pp_size = pp_size if pp_size >= 1 else 1 mp_size = self.model_parallel_size mp_size = mp_size if mp_size >= 1 else 1 - + self.update_value("model_parallel_size", mp_size) + # pp_size and mp_size are only used here to compute dp world size and nowhere else. 
dp_world_size = ((num_gpus / pp_size) / mp_size) if not (dp_world_size % 1 == 0): - error_message = self.__class__.__name__+".calculate_derived() "+f"(num_gpus / pp_size) / mp_size [({num_gpus} / {pp_size}) / {mp_size}] must be a whole number" + error_message = self.__class__.__name__ + ".calculate_derived() " + f"(num_gpus / pp_size) / mp_size [({num_gpus} / {pp_size}) / {mp_size}] must be a whole number" logging.error(error_message) raise AssertionError(error_message) # Automatically derive train_batch_size = train_micro_batch_size_per_gpu*num_gpus*gradient_accumulation_steps train_batch_size, train_micro_batch_size_per_gpu, gradient_accumulation_steps = self.calculate_batch_parameters( - dp_world_size=dp_world_size, - train_batch=self.train_batch_size, - micro_batch=self.train_micro_batch_size_per_gpu, + dp_world_size=dp_world_size, + train_batch=self.train_batch_size, + micro_batch=self.train_micro_batch_size_per_gpu, grad_acc=self.gradient_accumulation_steps - ) + ) self.check_batch_parameters( - dp_world_size=dp_world_size, - train_batch=train_batch_size, - micro_batch=train_micro_batch_size_per_gpu, + dp_world_size=dp_world_size, + train_batch=train_batch_size, + micro_batch=train_micro_batch_size_per_gpu, grad_acc=gradient_accumulation_steps ) self.update_values({ @@ -516,13 +516,16 @@ def calculate_derived(self): # zero optimization if self.zero_optimization is None: - self.zero_optimization = copy.deepcopy(ZERO_DEFAULTS) # a dict is overwritten and not updated key by key + self.zero_optimization = copy.deepcopy(ZERO_DEFAULTS) # a dict is overwritten and not updated key by key self.update_values({ "zero_stage": self.zero_optimization.get('stage', ZERO_DEFAULTS['stage']), "zero_reduce_scatter": self.zero_optimization.get('reduce_scatter', ZERO_DEFAULTS['reduce_scatter']), - "zero_contiguous_gradients": self.zero_optimization.get('contiguous_gradients', ZERO_DEFAULTS['contiguous_gradients']), - "zero_reduce_bucket_size": self.zero_optimization.get('reduce_bucket_size', ZERO_DEFAULTS['reduce_bucket_size']), - "zero_allgather_bucket_size": self.zero_optimization.get('allgather_bucket_size', ZERO_DEFAULTS['allgather_bucket_size']) + "zero_contiguous_gradients": self.zero_optimization.get('contiguous_gradients', + ZERO_DEFAULTS['contiguous_gradients']), + "zero_reduce_bucket_size": self.zero_optimization.get('reduce_bucket_size', + ZERO_DEFAULTS['reduce_bucket_size']), + "zero_allgather_bucket_size": self.zero_optimization.get('allgather_bucket_size', + ZERO_DEFAULTS['allgather_bucket_size']) }) # optimizer and scheduler @@ -542,11 +545,15 @@ def calculate_derived(self): "warmup_max_lr": self.lr, "warmup_num_steps": int(self.train_iters * self.warmup), "total_num_steps": self.lr_decay_iters or self.train_iters - }} + }} # Fp16 loss scaling. 
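A quick worked example of the derivation in `calculate_derived()` above, using hypothetical values (16 GPUs, pipe parallel 2, model parallel 2). The invariant enforced by `check_batch_parameters` is `train_batch == micro_batch * grad_acc * dp_world_size`; given any two of the three batch parameters, the third follows.

```python
# Hypothetical cluster: values chosen for illustration only.
num_gpus, pp_size, mp_size = 16, 2, 2

dp_world_size = (num_gpus / pp_size) / mp_size            # 4.0 -- must be a whole number
assert dp_world_size % 1 == 0

micro_batch_per_gpu = 4
gradient_accumulation_steps = 8
train_batch_size = int(micro_batch_per_gpu * gradient_accumulation_steps * dp_world_size)

print(dp_world_size, train_batch_size)                    # 4.0 128
```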
self.update_value("dynamic_loss_scale", self.loss_scale is None) + # Update 'is pipe parallel' flag + # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with + # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs + self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) ############################################################################################################################ # start of validation functions @@ -563,12 +570,13 @@ def validate_keys(cls): source_vars = list(source_class.__dataclass_fields__) for item in source_vars: if item in defined_properties.keys(): - logging.error(f'({cls.__name__}) duplicate of item: {item}, in class {source_class.__name__} and {defined_properties[item]}') + logging.error( + f'({cls.__name__}) duplicate of item: {item}, in class {source_class.__name__} and {defined_properties[item]}') return False else: defined_properties[item] = source_class.__name__ return True - + def validate_values(self): # the current codebase assumes running with deepspeed only if not self.deepspeed: @@ -576,7 +584,7 @@ def validate_values(self): # learning rate if self.lr is None: - error_message = self.__class__.__name__+".validate_values() lr is None" + error_message = self.__class__.__name__ + ".validate_values() lr is None" logging.error(error_message) raise ValueError(error_message) return False @@ -585,67 +593,58 @@ def validate_values(self): required_args = ['num_layers', 'hidden_size', 'num_attention_heads', 'max_position_embeddings'] for req_arg in required_args: if getattr(self, req_arg) is None: - error_message = self.__class__.__name__+".validate_values() "+req_arg+" is None." + error_message = self.__class__.__name__ + ".validate_values() " + req_arg + " is None." logging.error(error_message) raise ValueError(error_message) return False # Checks. if self.hidden_size % self.num_attention_heads != 0: - error_message = self.__class__.__name__+".validate_values() hidden_size must be divisable by num_attention_heads" + error_message = self.__class__.__name__ + ".validate_values() hidden_size must be divisable by num_attention_heads" logging.error(error_message) raise ValueError(error_message) return False if self.seq_length is not None: - if not(self.max_position_embeddings >= self.seq_length): - error_message = self.__class__.__name__+".validate_values() max_position_embeddings must be bigger or equal seq_length" + if not (self.max_position_embeddings >= self.seq_length): + error_message = self.__class__.__name__ + ".validate_values() max_position_embeddings must be bigger or equal seq_length" logging.error(error_message) raise ValueError(error_message) return False - - if not(self.min_lr <= self.lr): - error_message = self.__class__.__name__+".validate_values() min_lr must be smaller or equal lr" + + if not (self.min_lr <= self.lr): + error_message = self.__class__.__name__ + ".validate_values() min_lr must be smaller or equal lr" logging.error(error_message) raise ValueError(error_message) return False if self.save is not None and self.save_interval is None: - error_message = self.__class__.__name__+".validate_values() save_interval must be defined if save is defined" + error_message = self.__class__.__name__ + ".validate_values() save_interval must be defined if save is defined" logging.error(error_message) raise ValueError(error_message) return False # Parameters sharing does not work with torch DDP. 
if (self.num_unique_layers is not None) and (self.num_layers is not None): - + if not (self.num_unique_layers <= self.num_layers): - error_message = self.__class__.__name__+".validate_values() num-unique-layers must be smaller or equal num_layers" + error_message = self.__class__.__name__ + ".validate_values() num-unique-layers must be smaller or equal num_layers" logging.error(error_message) raise ValueError(error_message) return False if not (self.num_layers % self.num_unique_layers == 0): - error_message = self.__class__.__name__+".validate_values() num-layers should be divisible by num-unique-layers" + error_message = self.__class__.__name__ + ".validate_values() num-layers should be divisible by num-unique-layers" logging.error(error_message) raise ValueError(error_message) return False - if self.fp16_lm_cross_entropy and self.precision != "fp16": - error_message = self.__class__.__name__+".validate_values() lm cross entropy in fp16 only support in fp16 mode." - logging.error(error_message) - raise ValueError(error_message) - return False - - # Activation checkpointing. - if self.distribute_checkpointed_activations and not self.checkpoint_activations: - error_message = self.__class__.__name__+".validate_values() 'for distribute-checkpointed-activations to work you need to enable checkpoint-activations'" + error_message = self.__class__.__name__ + ".validate_values() lm cross entropy in fp16 only support in fp16 mode." logging.error(error_message) raise ValueError(error_message) return False - return True def validate_types(self): @@ -655,16 +654,16 @@ def validate_types(self): for field_name, field_def in self.__dataclass_fields__.items(): actual_value = getattr(self, field_name) - if actual_value is None: - continue # we allow for some values not to be configured + if actual_value is None: + continue # we allow for some values not to be configured actual_type = type(actual_value) if actual_type != field_def.type: - if actual_type == int and field_def.type == float: # floats should be able to be configured as ints + if actual_type == int and field_def.type == float: # floats should be able to be configured as ints continue # for typing.Literal (i.e a list of choices) - checks that actual value is in accepted values - elif field_def.type.__origin__ == Literal: + elif field_def.type.__origin__ == Literal: accepted_values = field_def.type.__args__ if actual_value in accepted_values: continue @@ -673,36 +672,42 @@ def validate_types(self): lowercase_accepted_values = [i.lower() for i in accepted_values if isinstance(i, str)] if actual_value.lower() in lowercase_accepted_values: continue - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: '{actual_value}' Not in accepted values: '{accepted_values}'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_value}' Not in accepted values: '{accepted_values}'") return False - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: '{actual_type}' instead of '{field_def.type}'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_type}' instead of '{field_def.type}'") return False - + # validate deepspeed dicts for field_name in ["optimizer", "scheduler"]: value = getattr(self, field_name) - if isinstance(value, dict): # dict is checked above, only fields are checked here + if isinstance(value, dict): # dict is checked above, only fields are checked here if "type" in value: if not isinstance(value["type"], str): - 
logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: key 'type' must be a string") - return False + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: key 'type' must be a string") + return False else: - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: must contain key 'type'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'type'") return False if "params" in value: if not isinstance(value["params"], dict): - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: key 'params' must be a dict") - return False + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: key 'params' must be a dict") + return False else: - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: must contain key 'params'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'params'") return False - + for field_name in ["fp16", "amp", "flops_profiler"]: value = getattr(self, field_name) if isinstance(value, dict): if not "enabled" in value: - error_message = self.__class__.__name__+".validate_types() "+f"{field_name}: must contain key 'enabled'" + error_message = self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'enabled'" logging.error(error_message) return False diff --git a/megatron/neox_arguments/megatron_args.py b/megatron/neox_arguments/neox_args.py similarity index 96% rename from megatron/neox_arguments/megatron_args.py rename to megatron/neox_arguments/neox_args.py index c10ab2ea7..3b1c1bcca 100644 --- a/megatron/neox_arguments/megatron_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -3,6 +3,7 @@ from .template import NeoXArgsTemplate from typing import Literal + def get_git_commit_hash(): """ Gets the git commit hash of your current repo (if it exists) """ try: @@ -12,14 +13,14 @@ def get_git_commit_hash(): git_hash = None return git_hash + @dataclass class NeoXArgsParallelism(NeoXArgsTemplate): - pipe_parallel_size: int = 0 """ Number of pipeline parallel stages. Disable with 0. """ - + model_parallel_size: int = 1 """ Size of the model parallelism. @@ -35,11 +36,16 @@ class NeoXArgsParallelism(NeoXArgsTemplate): Total world size (i.e number of gpus in cluster). Configured post-launch using distributed launcher """ + is_pipe_parallel: bool = False + """ + flag to determine whether pipeline parallelism is on - shouldn't be set by user, is automatically determined + according to pipeline parallel size. + """ + @dataclass class NeoXArgsModel(NeoXArgsTemplate): - - precision: Literal["fp16", "fp32"] = None + precision: Literal["fp16", "fp32"] = None """ description of the used precision, either one of fp16 or fp32 (and in the future bf16). """ @@ -188,13 +194,13 @@ class NeoXArgsModel(NeoXArgsTemplate): """ Run attention masking and softmax in fp32. """ - - rotary_pct: float = 1.0 + + rotary_pct: float = 1.0 """ pct of hidden dims to apply rotary positional embedding to """ - rotary_emb_base: int = 10000 + rotary_emb_base: int = 10000 """ Base for rotary positional embedding """ @@ -202,7 +208,6 @@ class NeoXArgsModel(NeoXArgsTemplate): @dataclass class NeoXArgsOptimizer(NeoXArgsTemplate): - optimizer_type: Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3'] = "adam" """ Type of optimizer to use. 
Choose from ['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3'] @@ -212,28 +217,28 @@ class NeoXArgsOptimizer(NeoXArgsTemplate): """ Zero Optimizer stage """ - + zero_reduce_scatter: bool = None """ Zero: Uses reduce or reduce scatter instead of allreduce to average gradients """ - + zero_contiguous_gradients: bool = None """ Zero: Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. """ - + zero_reduce_bucket_size: int = None """ Zero: Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes """ - + zero_allgather_bucket_size: int = None """ Zero: Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes """ - lr: float = None + lr: float = None """ Max Learning rate during training """ @@ -274,7 +279,6 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): @dataclass class NeoXArgsLogging(NeoXArgsTemplate): - wandb_group: str = None """Weights and Biases group name - used to group together "runs".""" @@ -334,7 +338,6 @@ class NeoXArgsLogging(NeoXArgsTemplate): @dataclass class NeoXArgsOther(NeoXArgsTemplate): - distributed_backend: str = "nccl" """ Which backend to use for distributed training. @@ -407,7 +410,7 @@ class NeoXArgsOther(NeoXArgsTemplate): """ Run via MPI, this will attempt to discover the necessary variables to initialize torch distributed from the MPI environment """ - + user_script: str = None """ user script to be run @@ -433,10 +436,11 @@ class NeoXArgsOther(NeoXArgsTemplate): Set during training """ + @dataclass class NeoXArgsTokenizer(NeoXArgsTemplate): - - tokenizer_type: Literal["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"] = "GPT2BPETokenizer" + tokenizer_type: Literal[ + "GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"] = "GPT2BPETokenizer" """ Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"] """ @@ -450,7 +454,6 @@ class NeoXArgsTokenizer(NeoXArgsTemplate): @dataclass class NeoXArgsTextgen(NeoXArgsTemplate): - text_gen_type: str = None """ How to generate text/sample the model. @@ -510,7 +513,6 @@ class NeoXArgsTextgen(NeoXArgsTemplate): @dataclass class NeoXArgsTraining(NeoXArgsTemplate): - data_path: str = None """ Path to combined dataset to split. @@ -641,44 +643,39 @@ class NeoXArgsTraining(NeoXArgsTemplate): Chunk size (number of layers) for checkpointing. """ - distribute_checkpointed_activations: bool = False - """ - If set, distribute checkpointed activations across model parallel group. - """ - - deepspeed_activation_checkpointing: bool = False + deepspeed_activation_checkpointing: bool = True """ + DEPRECATED - TODO: remove Uses activation checkpointing from deepspeed """ - + contiguous_checkpointing: bool = False """ Contiguous memory checkpointing for activations. """ - + checkpoint_in_cpu: bool = False """ Move the activation checkpoints to CPU. """ - + synchronize_each_layer: bool = False """ does a synchronize at the beginning and end of each checkpointed layer. """ - + profile_backward: bool = False """ Enables backward pass profiling for checkpointed layers. """ - + partition_activations: bool = False """ Partition Activations across GPUs before checkpointing. """ gas: int = None - """gradient_accumulation_steps""" #TODO this is a duplicate, remove? 
- + """gradient_accumulation_steps""" # TODO this is a duplicate, remove? clip_grad: float = None """ @@ -709,4 +706,4 @@ class NeoXArgsTraining(NeoXArgsTemplate): min_scale: float = 1.0 """ Minimum loss scale for dynamic loss scale. - """ \ No newline at end of file + """ diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 1997bbb2a..b24a82a76 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -21,6 +21,7 @@ import json import os import time +from typing import List, Union import torch import torch.nn.functional as F @@ -29,8 +30,7 @@ from megatron import get_tokenizer from megatron import mpu from megatron.utils import get_ltor_masks_and_position_ids, is_mp_rank_0 -from megatron.fp16 import fp32_to_fp16 -from typing import List, Union + def get_batch(context_tokens): """Generate batch from context tokens.""" @@ -165,14 +165,22 @@ def forward_model(model, model_inputs): # we need to forward a pipe model by access model.module() instead of just model() args = get_args() torch.distributed.barrier() - if args.pipe_parallel_size == 1: + if args.pipe_parallel_size <= 1: return model.module(model_inputs) - elif args.pipe_parallel_size > 1: - data_iterator = iter([[model_inputs, torch.Tensor(1)]]) # we need to feed in fake labels bc deepspeed is only built for training + else: + data_iterator = iter( + [[model_inputs, torch.Tensor(1)]]) # we need to feed in fake labels bc deepspeed is only built for training x = model.inference_batch(data_iterator) return x - else: - return model(*model_inputs) + + +def broadcast_terminate_signal(terminate_runs: int): + """Send signal to all workers to terminate if we've finished the process""" + terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) + torch.distributed.broadcast(terminate_runs_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + return terminate_runs_tensor[0].item() def sample_sequence_batch(model, context_tokens, context_lengths, @@ -215,10 +223,10 @@ def sample_sequence_batch(model, context_tokens, context_lengths, if args.recompute: # we need to use args instead of kwargs here because deepspeed :| model_inputs = (tokens, - position_ids, - attention_mask, - torch.Tensor(), - ) + position_ids, + attention_mask, + torch.Tensor(), + ) logits = forward_model(model, model_inputs) logits = logits[:, context_length - 1, :] else: @@ -231,10 +239,10 @@ def sample_sequence_batch(model, context_tokens, context_lengths, positions2use = position_ids[:, context_length - 1].view( batch_size, -1) # we have to use args instead of kwargs here because deepspeed :| - model_inputs = (tokens2use, # input_ids - positions2use, # position_ids - attention_mask, # attention_mask - layer_past, # layer_past + model_inputs = (tokens2use, # input_ids + positions2use, # position_ids + attention_mask, # attention_mask + layer_past, # layer_past ) logits, layer_past = forward_model(model, model_inputs) @@ -247,14 +255,14 @@ def sample_sequence_batch(model, context_tokens, context_lengths, logits = logits.float() logits /= args.temperature logits = top_k_logits(logits, top_k=args.top_k, - top_p=args.top_p) + top_p=args.top_p) log_probs = F.softmax(logits, dim=-1) prev = torch.multinomial(log_probs, num_samples=1).view(-1) print_logits = [] for p in prev: print_logits.append([logits[i, p].item() - for i in range(batch_size)]) + for i in range(batch_size)]) started = context_lengths <= context_length tokens[:, context_length] = switch( tokens[:, 
context_length].view(-1), prev, started) @@ -290,7 +298,7 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): assert any([isinstance(text, str), isinstance(text, list)]), "Text should be in string or list form" if isinstance(text, str): text = [text] - + if is_mp_rank_0(): input_count = len(text) input_pos = 0 @@ -300,7 +308,7 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): generated_texts = [] while True: start_time = time.time() - + # Tokenize text, and check whether we should terminate process terminate_runs = 0 if is_mp_rank_0(): @@ -315,19 +323,14 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): if context_length >= (args.seq_length // 2): print_rank_0("\nContext length", context_length, - "\nPlease give smaller context (half of the " - "sequence length)!", flush=True) + "\nPlease give smaller context (half of the " + "sequence length)!", flush=True) continue else: context_tokens = tokenizer.tokenize("EMPTY TEXT") context_length = len(context_tokens) - # Send signal to all workers to terminate if we've finished the process - terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) - torch.distributed.broadcast(terminate_runs_tensor, - mpu.get_model_parallel_src_rank(), - group=mpu.get_model_parallel_group()) - terminate_runs = terminate_runs_tensor[0].item() + terminate_runs = broadcast_terminate_signal(terminate_runs) if terminate_runs == 1: return generated_texts @@ -349,7 +352,7 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): generated_texts.append(data) if iterations % args.log_interval == 0: print_rank_0('Avg s/batch:', - (time.time() - start_time) / min(args.log_interval, iterations + 1)) + (time.time() - start_time) / min(args.log_interval, iterations + 1)) start_time = time.time() iterations += 1 @@ -375,7 +378,7 @@ def generate_samples_input_from_file(model): if args.sample_output_file is None: sample_output_file = args.sample_input_file + ".out" print_rank_0('could not find `sample-output-file`, setting ' - 'it to {}'.format(sample_output_file)) + 'it to {}'.format(sample_output_file)) else: sample_output_file = args.sample_output_file f_out = open(sample_output_file, "w+") @@ -417,19 +420,14 @@ def generate_samples_interactive(model, print_frequency=24): if context_length >= (args.seq_length // 2): print_rank_0("\nContext length", context_length, - "\nPlease give smaller context (half of the " - "sequence length)!", flush=True) + "\nPlease give smaller context (half of the " + "sequence length)!", flush=True) continue else: context_tokens = tokenizer.tokenize("EMPTY TEXT") context_length = len(context_tokens) - terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) - torch.distributed.broadcast(terminate_runs_tensor, - mpu.get_model_parallel_src_rank(), - group=mpu.get_model_parallel_group()) - terminate_runs = terminate_runs_tensor[0].item() - + terminate_runs = broadcast_terminate_signal(terminate_runs) if terminate_runs == 1: return @@ -439,7 +437,7 @@ def generate_samples_interactive(model, print_frequency=24): decode_tokens = decode_tokens[0].cpu().numpy().tolist() if mpu.get_model_parallel_rank() == 0 and \ - counter % print_frequency == 0: + counter % print_frequency == 0: os.system('clear') print_rank_0("\nContext:", raw_text, flush=True) trim_decode_tokens = tokenizer.detokenize( @@ -489,7 +487,7 @@ def generate_samples_unconditional(model): if token_stream is None: break if ctr % args.log_interval == 0: print_rank_0('Avg s/batch:', - (time.time() - 
start_time) / min(args.log_interval, ctr + 1)) + (time.time() - start_time) / min(args.log_interval, ctr + 1)) start_time = time.time() length = len(token_stream) token_batch = token_stream[0].cpu().numpy().tolist() diff --git a/megatron/training.py b/megatron/training.py index c6225c597..f8e8d2a1c 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -26,8 +26,6 @@ import sys import torch -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from apex.optimizers import FusedAdam as Adam from megatron import get_args from megatron import get_timers @@ -35,7 +33,6 @@ from megatron import print_rank_0 from megatron.checkpointing import load_checkpoint from megatron.checkpointing import save_checkpoint -from megatron.fp16 import FP16_Module from megatron.global_vars import get_use_wandb from megatron.initialize import initialize_megatron from megatron.learning_rates import AnnealingLR @@ -138,8 +135,6 @@ def get_optimizer(model): if args.no_load_optim: return None, None # Build parameter groups (weight decay and non-decay). - while isinstance(model, (torchDDP, FP16_Module)): - model = model.module param_groups = get_params_for_weight_decay_optimization(model, args) print_rank_0(f'Configuring Optimizer type: {args.optimizer_type} with params: {args.optimizer["params"]}') # Add model parallel attribute if it is not set. @@ -168,6 +163,13 @@ def get_optimizer(model): **args.optimizer["params"]) elif args.optimizer_type.lower() == "adam": # Use Adam + try: + # default to apex as it's slightly faster + from apex.optimizers import FusedAdam as Adam + except ImportError: + # if apex isn't installed, use deepspeed's FusedAdam + print("WARNING: APEX not installed - defaulting to deepspeed's fused adam") + from deepspeed.ops.adam import FusedAdam as Adam optimizer = Adam(param_groups, weight_decay=args.weight_decay, **args.optimizer["params"]) @@ -237,16 +239,15 @@ def setup_model_and_optimizer(model_provider_func): optimizer=optimizer, args=args, lr_scheduler=_lr_scheduler, - mpu=mpu if args.pipe_parallel_size == 0 else None, dist_init_required=False, model_parameters=_model_params, config_params=args.deepspeed_config, + mpu=mpu if not args.is_pipe_parallel else None ) - model.total_params = get_total_params(model.module) print_rank_0(f' > total params: {"{:,}".format(model.total_params)}') - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: model.set_batch_fn(model.module._megatron_batch_fn) else: raise ValueError("Must be using deepspeed to run neox") @@ -257,11 +258,6 @@ def setup_model_and_optimizer(model_provider_func): else: args.iteration = 0 - # get model without FP16 and/or TorchDDP wrappers - unwrapped_model = model - while hasattr(unwrapped_model, 'module'): - unwrapped_model = unwrapped_model.module - return model, optimizer, lr_scheduler @@ -293,9 +289,11 @@ def train_step(forward_step_func, data_iterator, timers = get_timers() # Pipeline parallelism schedules forward/backward/step - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: return train_step_pipe(model, data_iterator) + # TODO: Dead code (?) + # Forward model for one step. 
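The optimizer change above prefers apex's `FusedAdam` and falls back to DeepSpeed's fused Adam when apex is not installed. Below is a generic sketch of that optional-dependency pattern; the final `torch.optim.AdamW` fallback is added here only so the snippet runs without either package installed and is not part of the repo's logic.

```python
import torch

try:
    from apex.optimizers import FusedAdam as Adam          # fastest path, needs apex
except ImportError:
    try:
        from deepspeed.ops.adam import FusedAdam as Adam   # next best, needs deepspeed
    except ImportError:
        from torch.optim import AdamW as Adam              # always available (sketch-only fallback)

params = [torch.nn.Parameter(torch.randn(4, 4))]
optimizer = Adam(params, lr=1e-4, weight_decay=0.01)
print(type(optimizer).__name__)
```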
timers('forward').start() loss, loss_reduced = forward_step_func(data_iterator, model) @@ -371,7 +369,7 @@ def add_to_logging(name): if name in timers.timers: timers_to_log.append(name) - if args.pipe_parallel_size <= 0: + if not args.is_pipe_parallel: add_to_logging('forward') add_to_logging('backward') add_to_logging('backward-backward') @@ -602,11 +600,10 @@ def evaluate_and_print_results(prefix, forward_step_func, # Pipeline parallelism needs eval_batch() instead of a simple forward(). args = get_args() - if args.pipe_parallel_size > 0: - def _eval_helper(data_iter, pipe_model): + if args.is_pipe_parallel: + def _eval_helper(data_iter, _): loss = model.eval_batch(data_iter) return None, {'lm loss': loss} - forward_step_func = _eval_helper total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose) @@ -634,7 +631,7 @@ def build_train_valid_test_data_iterators( print_rank_0('> building train, validation, and test datasets ...') # Ensure only the first/last pipeline stages have data loaders - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: is_first_stage = mpu.get_pipe_parallel_rank() == 0 is_last_stage = mpu.get_pipe_parallel_rank() == mpu.get_pipe_parallel_world_size() - 1 pipe_load = is_first_stage or is_last_stage @@ -679,7 +676,7 @@ def build_train_valid_test_data_iterators( flags = torch.cuda.LongTensor([0, 0, 0]) # Broadcast num tokens. - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: # Only first/last pipeline stages have data loaders, so pipeline parallelism should # broadcast globally instead of just the model parallel group. torch.distributed.broadcast(flags, src=0) diff --git a/megatron/utils.py b/megatron/utils.py index 6bad05ea8..17faaded5 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -34,7 +34,6 @@ from megatron import mpu from megatron.data.samplers import DistributedBatchSampler from megatron.global_vars import get_use_wandb, get_tensorboard_writer -from megatron.fp16 import FP16_Optimizer from deepspeed import PipelineEngine, DeepSpeedEngine @@ -44,7 +43,6 @@ def reduce_losses(losses): [loss.clone().detach().view(1) for loss in losses]) torch.distributed.all_reduce(reduced_losses) reduced_losses = reduced_losses / torch.distributed.get_world_size() - return reduced_losses @@ -62,26 +60,6 @@ def report_memory(name): print_rank_0(string) -def print_params_min_max_norm(optimizer, iteration): - """Print min, max, and norm of all parameters.""" - index = 0 - rank = torch.distributed.get_rank() - string = 'iteration, rank, index, model-parallel,min, max, norm\n' - optimizer_ = optimizer - if isinstance(optimizer, FP16_Optimizer): - optimizer_ = optimizer.optimizer - for param_group in optimizer_.param_groups: - for param in param_group['params']: - index += 1 - min_ = param.data.min() - max_ = param.data.max() - norm = param.data.norm() - string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( - iteration, rank, index, int(param.model_parallel)) - string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) - print(string, flush=True) - - def check_adlr_autoresume_termination(iteration, model, optimizer, lr_scheduler): """Check for autoresume signal and exit if it is received.""" @@ -196,10 +174,12 @@ def is_local_main(): """ True if is the local main process """ return local_rank() == 0 + def is_mp_rank_0(): """True if mp rank == 0""" return mpu.get_model_parallel_rank() == 0 + def get_wandb_api_key(): """ Get Weights and Biases API key from ENV or .netrc file. 
Otherwise return None """ if 'WANDB_API_KEY' in os.environ: @@ -210,6 +190,7 @@ def get_wandb_api_key(): if wandb_token is not None: return wandb_token[1] + def obtain_resource_pool(hostfile_path, include_arg, exclude_arg) -> Dict[str, List[int]]: """ Get dict of `resource_pool[hostname] = [list of GPU ranks]` using hostfile, include and exclude args. @@ -235,7 +216,7 @@ def natural_sort(l): return sorted(l, key=alphanum_key) -def pipe_to_normal(model_engine): +def pipe_to_normal(model_engine, **kwargs): """ Takes in a deepspeed.PipelineEngine model and returns a deepspeed.DeepspeedEngine model with the same model weights so we can directly access the .forward() function (for inference). @@ -244,12 +225,17 @@ def pipe_to_normal(model_engine): """ assert isinstance(model_engine, PipelineEngine), f"model engine {model_engine} not a PipelineEngine instance" - return DeepSpeedEngine( + ret = DeepSpeedEngine( args=get_args(), model=model_engine.module, mpu=model_engine.module.mpu(), dist_init_required=False, - config_params=model_engine.config_params) + config_params=model_engine.config_params, + optimizer=model_engine.optimizer, + lr_scheduler=model_engine.lr_scheduler, + **kwargs) + return ret + def tb_wandb_log(key, value, iteration_no): # logs to both tb and wandb (if present) from the zeroth rank @@ -260,6 +246,7 @@ def tb_wandb_log(key, value, iteration_no): if get_use_wandb(): wandb.log({key: value}, step=iteration_no) + def ddb(rank=0): """ Distributed Debugger that will insert a py debugger on rank `rank` and diff --git a/pretrain_gpt2.py b/pretrain_gpt2.py index 6eb9217fb..4001dcc91 100644 --- a/pretrain_gpt2.py +++ b/pretrain_gpt2.py @@ -19,46 +19,32 @@ import socket import torch +import wandb from wandb import UsageError from megatron import get_args -from megatron import print_rank_0 from megatron import get_timers from megatron import get_tokenizer from megatron import mpu +from megatron import print_rank_0 from megatron.data.gpt2_dataset import build_train_valid_test_datasets -from megatron.global_vars import set_use_wandb, get_use_wandb -from megatron.model import GPT2Model, GPT2ModelPipe +from megatron.fp16 import fp32_to_fp16 +from megatron.global_vars import set_use_wandb +from megatron.model import GPT2ModelPipe +from megatron.model.gpt2_model import cross_entropy from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, is_local_main, local_rank, get_wandb_api_key from megatron.utils import reduce_losses -from megatron.fp16 import fp32_to_fp16 -import wandb - - -def model_provider(use_wandb=True, inference=False, get_key_value=True): - """Build the model.""" - args = get_args() - print_rank_0('building GPT2 model ...') - if args.pipe_parallel_size == 0: - model = GPT2Model(num_tokentypes=0, parallel_output=True, inference=inference, get_key_value=get_key_value) - else: - model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology(), inference=inference, get_key_value=get_key_value) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe - - ## Wandb. (one worker per machine) - # I think it should be like this it use the use_wandb input +def init_wandb(use_wandb, args): + # Wandb. 
(one worker per machine) use_wandb = is_local_main() and (get_wandb_api_key() is not None) and use_wandb set_use_wandb(use_wandb) args_dict = vars(args) if use_wandb: group_name = args_dict.get('wandb_group') name = f'{socket.gethostname()}-{local_rank()}' if group_name else None - try: wandb.init(project="neox", group=group_name, name=name, save_code=False, force=False, entity=args_dict.get('wandb_team')) @@ -66,27 +52,29 @@ def model_provider(use_wandb=True, inference=False, get_key_value=True): set_use_wandb(False) print(e) print('Skipping wandb. Execute `wandb login` on local or main node machine to enable.') - - if use_wandb: wandb.config.update(args_dict) - return model +def model_provider(use_wandb=True, inference=False, get_key_value=True): + """Build the model.""" -def get_batch(data_iterator): - """Generate a batch""" args = get_args() - tokenizer = get_tokenizer() + print_rank_0('building GPT2 model ...') + model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology(), inference=inference, + get_key_value=get_key_value) + if not args.is_pipe_parallel: + # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training + model = model.to_sequential() + else: + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + init_wandb(use_wandb, args) + return model - # Items and their type. - keys = ['text'] - datatype = torch.int64 - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None +def _get_batch(args, tokenizer, keys, data, datatype): + """Support function for get_batch / get_batch pipe (to avoid code repetition)""" data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. @@ -105,8 +93,8 @@ def get_batch(data_iterator): return tokens, labels, loss_mask, attention_mask, position_ids -def get_batch_pipe(data): - """A modification of get_batch() to work with the latest batch instead of an iterator. """ +def get_batch(data_iterator): + """Generate a batch""" args = get_args() tokenizer = get_tokenizer() @@ -115,21 +103,23 @@ def get_batch_pipe(data): datatype = torch.int64 # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + return _get_batch(args, tokenizer, keys, data, datatype) - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss) +def get_batch_pipe(data): + """A modification of get_batch() to work with the latest batch instead of an iterator. """ + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + tokens, labels, loss_mask, attention_mask, position_ids = _get_batch(args, tokenizer, keys, data, datatype) # unpack data if args.precision == "fp16": # cast to fp16 because pipeline parallelism skips the FP16 wrapper. @@ -148,10 +138,9 @@ def forward_step(data_iterator, model): tokens, labels, loss_mask, attention_mask, position_ids = get_batch( data_iterator) timers('batch generator').stop() - # Forward model. 
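In the refactor above, get_batch() and get_batch_pipe() both defer to a shared _get_batch() helper. At its core is the usual next-token setup: the label sequence is the input shifted left by one position. A self-contained sketch of just that shift (the helper name here is illustrative, and the mask / position-id construction done by get_ltor_masks_and_position_ids is omitted):

```python
import torch

def shift_for_lm(token_batch: torch.Tensor):
    """Split a [batch, seq+1] tensor into model inputs and next-token labels."""
    tokens_ = token_batch.long()
    labels = tokens_[:, 1:].contiguous()   # what the model should predict
    tokens = tokens_[:, :-1].contiguous()  # what the model actually sees
    return tokens, labels

tokens, labels = shift_for_lm(torch.arange(12).reshape(2, 6))
# tokens[0] -> [0, 1, 2, 3, 4]; labels[0] -> [1, 2, 3, 4, 5]
```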
- losses = model(tokens, position_ids, attention_mask, labels=labels) - loss_mask = loss_mask.view(-1) - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + outputs = model((tokens, position_ids, attention_mask)) + loss = cross_entropy(outputs, (labels, loss_mask), _fp16=args.fp16_lm_cross_entropy) # Reduce loss for logging. reduced_loss = reduce_losses([loss]) diff --git a/requirements/requirements-onebitadam.txt b/requirements/requirements-onebitadam.txt new file mode 100644 index 000000000..a6dd402b3 --- /dev/null +++ b/requirements/requirements-onebitadam.txt @@ -0,0 +1 @@ +cupy-cuda111==8.6.0 diff --git a/requirements/requirements-sparseattention.txt b/requirements/requirements-sparseattention.txt new file mode 100644 index 000000000..424b2d146 --- /dev/null +++ b/requirements/requirements-sparseattention.txt @@ -0,0 +1 @@ +triton==1.0.0.dev20210329 \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 2f7723432..c1638ad9a 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -3,7 +3,6 @@ six regex numpy==1.20.2 -e git+git://github.com/EleutherAI/DeeperSpeed.git@750f2140bf782cffeb578ce14a4e4cdb076f4326#egg=deepspeed -cupy-cuda111==8.6.0 mpi4py==3.0.3 wandb==0.10.25 einops==0.3.0 diff --git a/run_tests.py b/run_tests.py index 8ee9241bb..492272f5b 100644 --- a/run_tests.py +++ b/run_tests.py @@ -8,7 +8,6 @@ import unittest -from tests import * if __name__ == "__main__": loader = unittest.TestLoader() diff --git a/tasks/data_utils.py b/tasks/data_utils.py index 866a5e69a..f3376dc90 100644 --- a/tasks/data_utils.py +++ b/tasks/data_utils.py @@ -16,6 +16,7 @@ """ Tasks data utility.""" import re + import numpy as np diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index c89ea2cbf..04489c88c 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -21,8 +21,8 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import mpu +from megatron import print_rank_0 from tasks.finetune_utils import build_data_loader from tasks.finetune_utils import process_batch diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index 6df0eeef5..0effe0e82 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -21,9 +21,9 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import get_timers from megatron import mpu +from megatron import print_rank_0 from megatron.checkpointing import load_checkpoint from megatron.checkpointing import save_checkpoint from megatron.training import evaluate_and_print_results diff --git a/tasks/zeroshot_gpt2/datasets.py b/tasks/zeroshot_gpt2/datasets.py index 0d1f03756..077d00c94 100644 --- a/tasks/zeroshot_gpt2/datasets.py +++ b/tasks/zeroshot_gpt2/datasets.py @@ -22,8 +22,8 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import get_tokenizer +from megatron import print_rank_0 from .detokenizer import get_detokenizer diff --git a/tasks/zeroshot_gpt2/evaluate.py b/tasks/zeroshot_gpt2/evaluate.py index b1c06d205..06d04a719 100644 --- a/tasks/zeroshot_gpt2/evaluate.py +++ b/tasks/zeroshot_gpt2/evaluate.py @@ -20,15 +20,13 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import get_tokenizer from megatron import mpu +from megatron import print_rank_0 from megatron.checkpointing import load_checkpoint -from megatron.model import GPT2Model from megatron.training import get_model from 
megatron.utils import get_ltor_masks_and_position_ids from tasks.finetune_utils import build_data_loader - from .datasets import build_dataset @@ -48,9 +46,11 @@ def model_provider(): 'is not supported.'.format(eval_metric)) print_rank_0('building GPT2 model ...') - model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output) + # TODO: reimplement for pipe parallel + raise NotImplementedError + # model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output) - return model + # return model return model_provider diff --git a/tests/__init__.py b/tests/__init__.py index c3ed58809..7863a66f4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,5 +2,5 @@ Testcases for GPT NeoX """ -from .neox_args import * from .model import * +from .neox_args import * diff --git a/tests/model/__init__.py b/tests/model/__init__.py index 3c8d7288e..3b332cfaa 100644 --- a/tests/model/__init__.py +++ b/tests/model/__init__.py @@ -2,6 +2,5 @@ Tests concerning the GPT2Model class """ -from .test_model_initialization import TestModelInitialization from .test_model_checkpoint import TestModelCheckpoint #from .test_model_initialization_pipeline import TestModelInitializationPipeline \ No newline at end of file diff --git a/tests/model/test_configs/medium.yml b/tests/model/test_configs/medium.yml new file mode 100644 index 000000000..b75c018bd --- /dev/null +++ b/tests/model/test_configs/medium.yml @@ -0,0 +1,85 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe-parallel-size": 1, + "model-parallel-size": 1, + + # model settings + "num-layers": 4, + "hidden-size": 128, + "num-attention-heads": 8, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + "cpu_offload": False + }, + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data-impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight-decay": 0, + "hidden-dropout": 0, + "attention-dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train-iters": 320000, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "save-interval": 10000, + "eval-interval": 1000, + "eval-iters": 10, + + # logging + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/tests/model/test_configs/small.yml b/tests/model/test_configs/small.yml new file mode 100644 index 000000000..06787c78a --- /dev/null +++ b/tests/model/test_configs/small.yml @@ -0,0 +1,84 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe-parallel-size": 1, + "model-parallel-size": 1, + + # model settings + "num-layers": 12, + "hidden-size": 768, + "num-attention-heads": 12, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "zero_optimization": { + "stage": 0, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + "cpu_offload": False + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data-impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight-decay": 0.0, + "hidden-dropout": 0.0, + "attention-dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train-iters": 320000, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "save-interval": 10000, + "eval-interval": 1000, + "eval-iters": 10, + + # logging + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py index 3ee9c3551..efd27e9ed 100644 --- a/tests/model/test_model_checkpoint.py +++ b/tests/model/test_model_checkpoint.py @@ -1,26 +1,24 @@ import os -import re -import sys import shutil +import sys import unittest from unittest.mock import patch -from pathlib import Path if __name__ == "__main__": sys.path.append(os.path.abspath('')) from megatron.neox_arguments import NeoXArgs -from megatron.global_vars import set_global_variables, get_args, reset_global_variables -from megatron.model import GPT2Model, GPT2ModelPipe + +from megatron.global_vars import get_args, reset_global_variables + from megatron import initialize_megatron -from megatron import mpu from megatron.text_generation_utils import get_batch, forward_model from megatron.training import setup_model_and_optimizer from megatron.checkpointing import load_checkpoint from megatron.checkpointing import save_checkpoint from pretrain_gpt2 import model_provider -from megatron.utils import get_ltor_masks_and_position_ids, pipe_to_normal +from megatron.utils import pipe_to_normal from deepspeed import PipelineEngine from tests.common import get_root_directory, get_configs_with_path @@ -28,14 +26,14 @@ class TestModelCheckpoint(unittest.TestCase): - #def test_model_checkpoint(self): - # self.assertTrue(self.run_test_model_checkpoint(pipe_parallel_size=1)) - - def test_model_checkpoint(self): + def run_checkpoint_test(self, config_yml): reset_global_variables() # intitially load config from files as would be the case in deepy.py - yaml_list = get_configs_with_path(["small.yml", "local_setup.yml"]) + yaml_list = get_configs_with_path(["local_setup.yml"]) + yaml_list.append(f"{get_root_directory()}/tests/model/test_configs/{config_yml}") + print(os.listdir(".")) + args_loaded = NeoXArgs.from_ymls(yaml_list) args_loaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py")) args_loaded.update_value("pipe_parallel_size", 1) # overwrite pipeline parameter, config in small.yml may have changed! 
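The checkpoint test is reshaped from one hard-coded case into a run_checkpoint_test(config_yml) helper driven by per-size test methods over the small.yml and medium.yml configs above. A stripped-down sketch of that structure; the helper body here is only a placeholder, not the real save/reload round-trip:

```python
import unittest

class TestModelCheckpoint(unittest.TestCase):
    def run_checkpoint_test(self, config_yml: str):
        # Placeholder for the real flow: load the YAML config, build the model,
        # save a checkpoint, reload it, and compare outputs and parameters.
        self.assertTrue(config_yml.endswith(".yml"))

    def test_model_small(self):
        self.run_checkpoint_test("small.yml")

    def test_model_medium(self):
        self.run_checkpoint_test("medium.yml")

if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestModelCheckpoint("test_model_medium"))
    unittest.TextTestRunner(failfast=False).run(suite)
```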
@@ -61,9 +59,14 @@ def test_model_checkpoint(self): # Initialize new model model model, optimizer, lr_scheduler = setup_model_and_optimizer(lambda: model_provider(use_wandb=False)) - if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): - # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model - model = pipe_to_normal(model) + + # save model checkpoint + save_checkpoint(42, model, optimizer, lr_scheduler) + + #if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): + # # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model + # model = pipe_to_normal(model) + #model.to_sequential() model.eval() context_tokens_tensor = torch.cuda.LongTensor([[1,2,3,4,5],[1,2,3,4,5],[6,7,8,9,10],[1,2,3,4,100]]) @@ -82,15 +85,12 @@ def test_model_checkpoint(self): self.assertFalse(torch.isclose(output[1], output[2]).all().item()) self.assertTrue(torch.isclose(output[1, 3], output[3, 3]).all().item()) - # save model checkpoint - save_checkpoint(42, model, optimizer, lr_scheduler) - # reload model from checkpoint reloaded_model, optimizer, lr_scheduler = setup_model_and_optimizer(lambda: model_provider(use_wandb=False)) - if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): - # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model - model = pipe_to_normal(model) iteration = load_checkpoint(reloaded_model, optimizer, lr_scheduler) + if args.pipe_parallel_size == 1 and isinstance(reloaded_model, PipelineEngine): + # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model + reloaded_model = pipe_to_normal(reloaded_model) reloaded_model.eval() #ensure same checkpoint is loaded @@ -98,6 +98,10 @@ def test_model_checkpoint(self): reloaded_output = forward_model(model, (tokens, position_ids, attention_mask)) + #check re-loaded model returns the same results + self.assertTrue(torch.isclose(output, reloaded_output).all().item()) + + #check all weight groups are the same for idx, ((n1, p1), (n2, p2)) in enumerate(zip(list(model.module.named_parameters()), list(reloaded_model.module.named_parameters()))): self.assertTrue(n1 == n2) params_equal = (p1 == p2).all().item() @@ -105,14 +109,20 @@ def test_model_checkpoint(self): if not params_equal: print(f"test_model_checkpoint() layer {idx} {n1} has same parameters after loading of checkpoint", flush=True) - self.assertTrue(torch.isclose(output, reloaded_output).all().item()) - + #clear up checkpoint folder shutil.rmtree(path) - #TODO test changing batch size, because i had some weird experience with this last time + def test_model_small(self): + self.run_checkpoint_test("small.yml") + def test_model_medium(self): + self.run_checkpoint_test("medium.yml") if __name__ == "__main__": suite = unittest.TestSuite() - suite.addTest(TestModelCheckpoint("test_model_checkpoint")) + + #Run all required tests + #suite.addTest(TestModelCheckpoint("test_model_small")) + suite.addTest(TestModelCheckpoint("test_model_medium")) + unittest.TextTestRunner(failfast=False).run(suite) \ No newline at end of file diff --git a/tests/model/test_model_initialization.py b/tests/model/test_model_initialization.py index e2527d6d5..190eb9182 100644 --- a/tests/model/test_model_initialization.py +++ b/tests/model/test_model_initialization.py @@ -1,24 +1,22 @@ import os -import re import sys import unittest from unittest.mock import patch -from pathlib import 
Path if __name__ == "__main__": sys.path.append(os.path.abspath('')) from megatron.neox_arguments import NeoXArgs -from megatron.global_vars import set_global_variables, get_args, reset_global_variables +from megatron.global_vars import get_args, reset_global_variables from megatron.model import GPT2ModelPipe from megatron import initialize_megatron from megatron import mpu from tests.common import get_root_directory, get_configs_with_path -class TestModelInitializationPipeline(unittest.TestCase): +class TestModelInitialization(unittest.TestCase): - def test_model_initialization_pipeline(self): + def test_model_initialization(self): reset_global_variables() # intitially load config from files as would be the case in deepy.py diff --git a/tests/neox_args/__init__.py b/tests/neox_args/__init__.py index 535b02dab..f7dd120d7 100644 --- a/tests/neox_args/__init__.py +++ b/tests/neox_args/__init__.py @@ -2,8 +2,8 @@ Tests concerning NeoXArgs """ +from .test_neoxargs_commandline import TestNeoXArgsCommandLine from .test_neoxargs_implementation import TestNeoXArgsImplementation from .test_neoxargs_load import TestNeoXArgsLoad -from .test_neoxargs_commandline import TestNeoXArgsCommandLine +from .test_neoxargs_usage import TestNeoXArgsArgumentUsage from .test_neoxargs_validation import TestNeoXArgsValidation -from .test_neoxargs_usage import TestNeoXArgsArgumentUsage \ No newline at end of file diff --git a/tests/neox_args/test_neoxargs_commandline.py b/tests/neox_args/test_neoxargs_commandline.py index 24e5ddb68..84be22069 100644 --- a/tests/neox_args/test_neoxargs_commandline.py +++ b/tests/neox_args/test_neoxargs_commandline.py @@ -70,7 +70,7 @@ def test_neoxargs_consume_deepy_args_with_config_dir(self): self.assertTrue(args_loaded_yamls == args_loaded_consume) - def test_neoxargs_consume_megatron_args(self): + def test_neoxargs_consume_neox_args(self): """ verify megatron args are correctly consumed after sending via deepspeed """ @@ -83,7 +83,7 @@ def test_neoxargs_consume_megatron_args(self): # patch sys.argv so that args can be access by set_global_variables within initialize_megatron with patch('sys.argv', deepspeed_main_args): - args_loaded = NeoXArgs.consume_megatron_args() + args_loaded = NeoXArgs.consume_neox_args() #TODO is the wandb group really to be changed? 
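The renamed test exercises NeoXArgs.consume_neox_args() by patching sys.argv with the argument list the DeepSpeed launcher would pass to the training script. The same technique in isolation, using a plain argparse parser so the sketch stays self-contained:

```python
import argparse
import sys
from unittest.mock import patch

def consume_args():
    # Stand-in for NeoXArgs.consume_neox_args(): parses whatever is in sys.argv.
    parser = argparse.ArgumentParser()
    parser.add_argument("--wandb_group")
    return parser.parse_args(sys.argv[1:])

# Simulate the command line a launcher would construct.
fake_argv = ["pretrain_gpt2.py", "--wandb_group", "test-group"]
with patch("sys.argv", fake_argv):
    args = consume_args()
assert args.wandb_group == "test-group"
```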
args_loaded.wandb_group = args_baseline.wandb_group diff --git a/tests/neox_args/test_neoxargs_implementation.py b/tests/neox_args/test_neoxargs_implementation.py index ecc65ed16..702b038d6 100644 --- a/tests/neox_args/test_neoxargs_implementation.py +++ b/tests/neox_args/test_neoxargs_implementation.py @@ -1,9 +1,8 @@ -import os -import sys import unittest from megatron.neox_arguments import NeoXArgs + class TestNeoXArgsImplementation(unittest.TestCase): """ verify code implementation of NeoXArgs diff --git a/tests/neox_args/test_neoxargs_load.py b/tests/neox_args/test_neoxargs_load.py index 4113f2a20..84ee55b5b 100644 --- a/tests/neox_args/test_neoxargs_load.py +++ b/tests/neox_args/test_neoxargs_load.py @@ -1,7 +1,6 @@ import os import sys import unittest -from unittest.mock import patch if __name__ == "__main__": sys.path.append(os.path.abspath('')) diff --git a/tests/neox_args/test_neoxargs_validation.py b/tests/neox_args/test_neoxargs_validation.py index b0a57d264..3797495b2 100644 --- a/tests/neox_args/test_neoxargs_validation.py +++ b/tests/neox_args/test_neoxargs_validation.py @@ -1,9 +1,8 @@ -import os -import sys import unittest from megatron.neox_arguments import NeoXArgs + class TestNeoXArgsValidation(unittest.TestCase): """ verify the implementation of NeoXArgs diff --git a/text_gen_gpt2.py b/text_gen_gpt2.py index d933f13b0..ef68791d3 100755 --- a/text_gen_gpt2.py +++ b/text_gen_gpt2.py @@ -20,6 +20,7 @@ import os import sys + from pretrain_gpt2 import model_provider sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), @@ -30,9 +31,7 @@ from megatron.training import setup_model_and_optimizer from megatron.text_generation_utils import generate_and_write_samples_unconditional, generate_samples_input_from_file, \ generate_samples_interactive -from megatron.utils import pipe_to_normal -from deepspeed import PipelineEngine def main(extra_args_provider=None, get_key_value=True): """ @@ -53,9 +52,6 @@ def main(extra_args_provider=None, get_key_value=True): # Set up model and load checkpoint. 
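Most of the remaining hunks in the tasks/, tests/ and tools/ files only regroup imports into standard-library, third-party, and first-party blocks, roughly the ordering isort's default profile would produce. For reference, the grouping these files converge on looks like:

```python
# standard library
import os
import sys

# third-party
import torch

# first-party / project code
from megatron import print_rank_0
```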
model, _, _ = setup_model_and_optimizer(lambda: model_provider(use_wandb=False, inference=True, get_key_value=get_key_value)) - if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): - # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model - model = pipe_to_normal(model) print_rank_0('Finished loading model') if args.text_gen_type == 'unconditional': diff --git a/tools/convert_args_to_conf_file.py b/tools/convert_args_to_conf_file.py index 890b71651..210f5b7fc 100755 --- a/tools/convert_args_to_conf_file.py +++ b/tools/convert_args_to_conf_file.py @@ -1,24 +1,22 @@ #!/usr/bin/env python -import json import argparse +import dataclasses +import json import sys +from dataclasses import dataclass from io import StringIO from typing import Any -import dataclasses -import pandas as pd import deepspeed -from dataclasses import dataclass +import pandas as pd from deepspeed.constants import TORCH_DISTRIBUTED_DEFAULT_PORT from deepspeed.launcher.constants import PDSH_LAUNCHER from deepspeed.launcher.runner import DLTS_HOSTFILE - from megatron.arguments import _add_network_size_args, _add_regularization_args, _add_training_args, \ _add_initialization_args, _add_learning_rate_args, _add_checkpointing_args, _add_mixed_precision_args, \ _add_distributed_args, _add_validation_args, _add_data_args, _add_autoresume_args, _add_zero_args, \ _add_activation_checkpoint_args - from megatron.config_monster import megatron_keys_exclude, ds_config_keys diff --git a/tools/inspect_checkpoints.py b/tools/inspect_checkpoints.py index 63f12d128..b904f53ee 100644 --- a/tools/inspect_checkpoints.py +++ b/tools/inspect_checkpoints.py @@ -1,13 +1,14 @@ # Adapted from https://github.com/awaelchli/pytorch-lightning-snippets/blob/master/checkpoint/peek.py import code +import os +import re from argparse import ArgumentParser, Namespace from collections.abc import Mapping, Sequence from pathlib import Path -import pdb + import torch -import os -import re + class COLORS: BLUE = "\033[94m" diff --git a/tools/merge_mp_partitions.py b/tools/merge_mp_partitions.py index b4ec64db3..33bdb44ba 100644 --- a/tools/merge_mp_partitions.py +++ b/tools/merge_mp_partitions.py @@ -259,7 +259,7 @@ def main(): args.model_parallel_size = 1 mpu.initialize.set_model_parallel_rank(0) sd = {} - sd['model'] = merged_model.state_dict_for_save_checkpoint() + sd['model'] = merged_model.state_dict() sd['iteration'] = iteration merged_path = os.path.join(args.load, 'merged') checkpoint_name = get_checkpoint_name(merged_path, iteration) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 7ea56ec83..236365da9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -19,6 +19,7 @@ import multiprocessing import os import sys + import lm_dataformat as lmd sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),