diff --git a/Dockerfile b/Dockerfile index 179dad119..d3dc6517c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,9 +71,12 @@ RUN mkdir -p /home/mchorse/.ssh /job && \ #### Python packages RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge - COPY requirements/requirements.txt . -RUN pip install -r requirements.txt && pip cache purge +COPY requirements/requirements-onebitadam.txt . +COPY requirements/requirements-sparseattention.txt . +RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && pip install -r requirements-sparseattention.txt && pip cache purge + +## Install APEX RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597 # Clear staging diff --git a/deepy.py b/deepy.py index b2f67d49f..d4cb399da 100755 --- a/deepy.py +++ b/deepy.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os + import deepspeed -from deepspeed.launcher.runner import main import requests - -import logging +from deepspeed.launcher.runner import main logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) @@ -43,8 +43,13 @@ def get_wandb_api_key(): neox_args = NeoXArgs.consume_deepy_args() +if neox_args.wandb_group is not None: + # concat the wandb group name with a uid to make sure it's unique + import wandb + neox_args.wandb_group += "_" + wandb.util.generate_id() neox_args.print() deepspeed_main_args = neox_args.get_deepspeed_main_args() + if __name__ == '__main__': main(deepspeed_main_args) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 84be33101..36d24c4b8 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -26,7 +26,6 @@ import numpy as np import torch -from torch.nn.parallel import DistributedDataParallel as torchDDP from glob import glob from megatron import mpu, get_args @@ -34,6 +33,7 @@ from megatron import print_rank_0 from megatron.utils import natural_sort + def check_checkpoint_args(checkpoint_args): """Ensure fixed arguments for a model are the same for the input arguments and the one retreived frm checkpoint.""" @@ -104,9 +104,7 @@ def delete_old_checkpoints(save_dir, n_to_keep): def save_ds_checkpoint(iteration, model, args): """Save a model checkpoint.""" - - sd = {} - sd['iteration'] = iteration + sd = {'iteration': iteration} # rng states. if not args.no_save_rng: sd['random_rng_state'] = random.getstate() @@ -114,15 +112,6 @@ def save_ds_checkpoint(iteration, model, args): sd['torch_rng_state'] = torch.get_rng_state() sd['cuda_rng_state'] = torch.cuda.get_rng_state() sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states() - - if args.pipe_parallel_size == 0: - # megatron model uses state_dict_for_save_checkpointing instead of the standard state_dict - # state_dict is used by deepspeed for module saving so it needs to point to the right function - model.module.state_dict = model.module.state_dict_for_save_checkpoint - else: - # Pipeline parallelism manages its own state_dict. 
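# Illustrative round-trip sketch (not part of the patch): the client_state dict
# built in save_ds_checkpoint above is handed to DeepSpeed's save_checkpoint()
# and comes back as the second return value of load_checkpoint() (see the load
# path later in this file), which is how the iteration count and RNG states
# survive a restart. `model` is assumed to be a DeepSpeed engine; `args` and
# `iteration` come from the surrounding training loop.
import torch

sd = {'iteration': iteration,
      'torch_rng_state': torch.get_rng_state(),
      'cuda_rng_state': torch.cuda.get_rng_state()}
model.save_checkpoint(args.save, client_state=sd)

# ...later, when resuming:
_, client_sd = model.load_checkpoint(args.load,
                                     load_optimizer_states=not args.no_load_optim,
                                     load_lr_scheduler_states=not args.no_load_optim)
iteration = client_sd['iteration']
torch.set_rng_state(client_sd['torch_rng_state'])
torch.cuda.set_rng_state(client_sd['cuda_rng_state'])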
- pass - model.save_checkpoint(args.save, client_state=sd) @@ -152,17 +141,14 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): torch.distributed.barrier() -def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'): +def load_checkpoint(model, optimizer, lr_scheduler): """Load a model checkpoint and return the iteration.""" args = get_args() - load_dir = getattr(args, load_arg) - if isinstance(model, torchDDP): - model = model.module # Read the tracker file and set the iteration. - tracker_filename = get_checkpoint_tracker_filename(load_dir) + tracker_filename = get_checkpoint_tracker_filename(args.load) - # If no tracker file, return iretation zero. + # If no tracker file, return iteration zero. if not os.path.isfile(tracker_filename): print_rank_0('WARNING: could not find the metadata file {} '.format( tracker_filename)) @@ -190,7 +176,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'): if args.deepspeed: load_optim_and_scheduler = not args.no_load_optim # TODO: These should be configured by separate args - checkpoint_name, state_dict = model.load_checkpoint(load_dir, + checkpoint_name, state_dict = model.load_checkpoint(args.load, load_optimizer_states=load_optim_and_scheduler, load_lr_scheduler_states=load_optim_and_scheduler) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py deleted file mode 100644 index 231744bf7..000000000 --- a/megatron/data/dataset_utils.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright (c) 2021, EleutherAI contributors -# This file is based on code by the authors denoted below and has been modified from its original version. -# -# Copyright 2018 The Google AI Language Team Authors, and NVIDIA. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Most of the code here has been copied from: -# https://github.com/google-research/albert/blob/master/create_pretraining_data.py -# with some modifications. - -import time -import collections - -import numpy as np -from megatron import get_args, print_rank_0 -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - -DSET_TYPE_STD = 'standard_bert' -DSET_TYPE_ICT = 'ict' - -DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD] - - -def compile_helper(): - """Compile helper function ar runtime. Make sure this - is invoked on a single process.""" - import os - import subprocess - path = os.path.abspath(os.path.dirname(__file__)) - ret = subprocess.run(['make', '-C', path]) - if ret.returncode != 0: - print("Making C++ dataset helpers module failed, exiting.") - import sys - sys.exit(1) - - -def get_a_and_b_segments(sample, np_rng): - """Divide sample into a and b segments.""" - - # Number of sentences in the sample. - n_sentences = len(sample) - # Make sure we always have two sentences. - assert n_sentences > 1, 'make sure each sample has at least two sentences.' - - # First part: - # `a_end` is how many sentences go into the `A`. - a_end = 1 - if n_sentences >= 3: - # Note that randin in numpy is exclusive. 
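# Illustrative sketch (not part of the patch) of the A/B split deleted above:
# np_rng.randint's upper bound is exclusive, so a_end is always at least 1 and
# at most n_sentences - 1, guaranteeing segment B is never empty; half the time
# the two segments are swapped to create a "random next" pair.
import numpy as np

np_rng = np.random.RandomState(seed=0)
sample = [[1, 2], [3, 4], [5, 6], [7, 8]]    # four "sentences" of token ids
a_end = np_rng.randint(1, len(sample))       # one of 1, 2, 3 -- never 4
tokens_a = [t for s in sample[:a_end] for t in s]
tokens_b = [t for s in sample[a_end:] for t in s]
is_next_random = np_rng.random() < 0.5       # 50%: swap A and B
if is_next_random:
    tokens_a, tokens_b = tokens_b, tokens_a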
- a_end = np_rng.randint(1, n_sentences) - tokens_a = [] - for j in range(a_end): - tokens_a.extend(sample[j]) - - # Second part: - tokens_b = [] - for j in range(a_end, n_sentences): - tokens_b.extend(sample[j]) - - # Random next: - is_next_random = False - if np_rng.random() < 0.5: - is_next_random = True - tokens_a, tokens_b = tokens_b, tokens_a - - return tokens_a, tokens_b, is_next_random - - -def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): - """Truncates a pair of sequences to a maximum sequence length.""" - #print(len_a, len_b, max_num_tokens) - assert len_a > 0 - assert len_b > 0 - if len_a + len_b <= max_num_tokens: - return False - while len_a + len_b > max_num_tokens: - if len_a > len_b: - len_a -= 1 - tokens = tokens_a - else: - len_b -= 1 - tokens = tokens_b - if np_rng.random() < 0.5: - del tokens[0] - else: - tokens.pop() - return True - - -def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): - """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" - - tokens = [] - tokentypes = [] - # [CLS]. - tokens.append(cls_id) - tokentypes.append(0) - # Segment A. - for token in tokens_a: - tokens.append(token) - tokentypes.append(0) - # [SEP]. - tokens.append(sep_id) - tokentypes.append(0) - # Segment B. - for token in tokens_b: - tokens.append(token) - tokentypes.append(1) - # [SEP]. - tokens.append(sep_id) - tokentypes.append(1) - - return tokens, tokentypes - - -MaskedLmInstance = collections.namedtuple("MaskedLmInstance", - ["index", "label"]) - - -def is_start_piece(piece): - """Check if the current word piece is the starting piece (BERT).""" - # When a word has been split into - # WordPieces, the first token does not have any marker and any subsequence - # tokens are prefixed with ##. So whenever we see the ## token, we - # append it to the previous set of word indexes. - return not piece.startswith("##") - - -def create_masked_lm_predictions(tokens, - vocab_id_list, vocab_id_to_token_dict, - masked_lm_prob, - cls_id, sep_id, mask_id, - max_predictions_per_seq, - np_rng, - max_ngrams=3, - do_whole_word_mask=True, - favor_longer_ngram=False, - do_permutation=False): - """Creates the predictions for the masked LM objective. - Note: Tokens here are vocab ids and not text tokens.""" - - cand_indexes = [] - # Note(mingdachen): We create a list for recording if the piece is - # the starting piece of current token, where 1 means true, so that - # on-the-fly whole word masking is possible. - token_boundary = [0] * len(tokens) - - for (i, token) in enumerate(tokens): - if token == cls_id or token == sep_id: - token_boundary[i] = 1 - continue - # Whole Word Masking means that if we mask all of the wordpieces - # corresponding to an original word. - # - # Note that Whole Word Masking does *not* change the training code - # at all -- we still predict each WordPiece independently, softmaxed - # over the entire vocabulary. 
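# Toy illustration (not part of the patch) of the grouping performed just
# below: "##" continuation pieces are appended to the entry of their head
# piece, so whole-word masking later masks every piece of a word together.
# The real code works on vocab ids via vocab_id_to_token_dict; plain strings
# are used here for readability.
tokens = ["[CLS]", "un", "##break", "##able", "is", "[SEP]"]

cand_indexes = []
for i, tok in enumerate(tokens):
    if tok in ("[CLS]", "[SEP]"):
        continue
    if cand_indexes and tok.startswith("##"):   # continuation piece
        cand_indexes[-1].append(i)
    else:                                       # starting piece of a word
        cand_indexes.append([i])

print(cand_indexes)   # [[1, 2, 3], [4]] -> "unbreakable" is masked as a unit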
- if (do_whole_word_mask and len(cand_indexes) >= 1 and - not is_start_piece(vocab_id_to_token_dict[token])): - cand_indexes[-1].append(i) - else: - cand_indexes.append([i]) - if is_start_piece(vocab_id_to_token_dict[token]): - token_boundary[i] = 1 - - output_tokens = list(tokens) - - masked_lm_positions = [] - masked_lm_labels = [] - - if masked_lm_prob == 0: - return (output_tokens, masked_lm_positions, - masked_lm_labels, token_boundary) - - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - - # Note(mingdachen): - # By default, we set the probilities to favor shorter ngram sequences. - ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) - pvals = 1. / np.arange(1, max_ngrams + 1) - pvals /= pvals.sum(keepdims=True) - - if favor_longer_ngram: - pvals = pvals[::-1] - - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) - - np_rng.shuffle(ngram_indexes) - - masked_lms = [] - covered_indexes = set() - for cand_index_set in ngram_indexes: - if len(masked_lms) >= num_to_predict: - break - if not cand_index_set: - continue - # Note(mingdachen): - # Skip current piece if they are covered in lm masking or previous ngrams. - for index_set in cand_index_set[0]: - for index in index_set: - if index in covered_indexes: - continue - - n = np_rng.choice(ngrams[:len(cand_index_set)], - p=pvals[:len(cand_index_set)] / - pvals[:len(cand_index_set)].sum(keepdims=True)) - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # Note(mingdachen): - # Repeatedly looking for a candidate that does not exceed the - # maximum number of predictions by trying shorter ngrams. - while len(masked_lms) + len(index_set) > num_to_predict: - if n == 0: - break - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # If adding a whole-word mask would exceed the maximum number of - # predictions, then just skip this candidate. - if len(masked_lms) + len(index_set) > num_to_predict: - continue - is_any_index_covered = False - for index in index_set: - if index in covered_indexes: - is_any_index_covered = True - break - if is_any_index_covered: - continue - for index in index_set: - covered_indexes.add(index) - - masked_token = None - # 80% of the time, replace with [MASK] - if np_rng.random() < 0.8: - masked_token = mask_id - else: - # 10% of the time, keep original - if np_rng.random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word - else: - masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] - - output_tokens[index] = masked_token - - masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) - assert len(masked_lms) <= num_to_predict - - np_rng.shuffle(ngram_indexes) - - select_indexes = set() - if do_permutation: - for cand_index_set in ngram_indexes: - if len(select_indexes) >= num_to_predict: - break - if not cand_index_set: - continue - # Note(mingdachen): - # Skip current piece if they are covered in lm masking or previous ngrams. 
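# Quick numeric check (illustrative, not part of the patch) of the ngram
# length distribution used in this function: pvals is proportional to 1/n, so
# shorter spans are chosen most often unless favor_longer_ngram reverses the
# order.
import numpy as np

max_ngrams = 3
ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
pvals = 1. / np.arange(1, max_ngrams + 1)
pvals /= pvals.sum(keepdims=True)
print(dict(zip(ngrams.tolist(), pvals.round(3).tolist())))
# {1: 0.545, 2: 0.273, 3: 0.182}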
- for index_set in cand_index_set[0]: - for index in index_set: - if index in covered_indexes or index in select_indexes: - continue - - n = np.random.choice(ngrams[:len(cand_index_set)], - p=pvals[:len(cand_index_set)] / - pvals[:len(cand_index_set)].sum(keepdims=True)) - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - - while len(select_indexes) + len(index_set) > num_to_predict: - if n == 0: - break - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # If adding a whole-word mask would exceed the maximum number of - # predictions, then just skip this candidate. - if len(select_indexes) + len(index_set) > num_to_predict: - continue - is_any_index_covered = False - for index in index_set: - if index in covered_indexes or index in select_indexes: - is_any_index_covered = True - break - if is_any_index_covered: - continue - for index in index_set: - select_indexes.add(index) - assert len(select_indexes) <= num_to_predict - - select_indexes = sorted(select_indexes) - permute_indexes = list(select_indexes) - np_rng.shuffle(permute_indexes) - orig_token = list(output_tokens) - - for src_i, tgt_i in zip(select_indexes, permute_indexes): - output_tokens[src_i] = orig_token[tgt_i] - masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) - - masked_lms = sorted(masked_lms, key=lambda x: x.index) - - for p in masked_lms: - masked_lm_positions.append(p.index) - masked_lm_labels.append(p.label) - - return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - - -def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, - masked_labels, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - assert len(tokentypes) == num_tokens - assert len(masked_positions) == len(masked_labels) - - # Tokens and token types. - filler = [pad_id] * padding_length - tokens_np = np.array(tokens + filler, dtype=np.int64) - tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) - - # Padding mask. - padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, - dtype=np.int64) - - # Lables and loss mask. 
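# Illustrative sketch (not part of the patch) of the layout produced below:
# real tokens keep their ids and padding gets pad_id; labels are -1 everywhere
# except at masked positions, and loss_mask is 1 only at masked positions, so
# the LM loss is computed over masked tokens alone.
import numpy as np

max_seq_length, pad_id = 8, 0
tokens = [101, 7, 9, 11, 102]                 # 5 real tokens
masked_positions, masked_labels = [2], [42]   # token at index 2 was masked

filler = [pad_id] * (max_seq_length - len(tokens))
tokens_np = np.array(tokens + filler, dtype=np.int64)
padding_mask_np = np.array([1] * len(tokens) + [0] * len(filler), dtype=np.int64)

labels = [-1] * max_seq_length
loss_mask = [0] * max_seq_length
for pos, lab in zip(masked_positions, masked_labels):
    labels[pos], loss_mask[pos] = lab, 1
# tokens_np       -> [101  7  9 11 102  0  0  0]
# padding_mask_np -> [  1  1  1  1   1  0  0  0]
# labels          -> [ -1 -1 42 -1  -1 -1 -1 -1]
# loss_mask       -> [  0  0  1  0   0  0  0  0]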
- labels = [-1] * max_seq_length - loss_mask = [0] * max_seq_length - for i in range(len(masked_positions)): - assert masked_positions[i] < num_tokens - labels[masked_positions[i]] = masked_labels[i] - loss_mask[masked_positions[i]] = 1 - labels_np = np.array(labels, dtype=np.int64) - loss_mask_np = np.array(loss_mask, dtype=np.int64) - - return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np - - -def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): - - print_rank_0(' > building dataset index ...') - - start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, - data_impl, - skip_warmup) - assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - - print_rank_0(' > indexed dataset stats:') - print_rank_0(' number of documents: {}'.format( - indexed_dataset.doc_idx.shape[0] - 1)) - print_rank_0(' number of sentences: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -def get_train_valid_test_split_(splits_string, size): - """ Get dataset splits from comma or '/' separated string list.""" - - splits = [] - if splits_string.find(',') != -1: - splits = [float(s) for s in splits_string.split(',')] - elif splits_string.find('/') != -1: - splits = [float(s) for s in splits_string.split('/')] - else: - splits = [float(splits_string)] - while len(splits) < 3: - splits.append(0.) - splits = splits[:3] - splits_sum = sum(splits) - assert splits_sum > 0.0 - splits = [split / splits_sum for split in splits] - splits_index = [0] - for index, split in enumerate(splits): - splits_index.append(splits_index[index] + - int(round(split * float(size)))) - diff = splits_index[-1] - size - for index in range(1, len(splits_index)): - splits_index[index] -= diff - assert len(splits_index) == 4 - assert splits_index[-1] == size - return splits_index - - diff --git a/megatron/data/gpt2_dataset.py b/megatron/data/gpt2_dataset.py index 3b654be95..d1cef21e7 100644 --- a/megatron/data/gpt2_dataset.py +++ b/megatron/data/gpt2_dataset.py @@ -25,7 +25,6 @@ import torch from megatron import mpu, print_rank_0 -from megatron.data.dataset_utils import get_train_valid_test_split_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset @@ -111,7 +110,6 @@ def __len__(self): # sample i --> [sample_idx[i], sample_idx[i+1]) return self.sample_idx.shape[0] - 1 - def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] @@ -182,7 +180,6 @@ def _build_index_mappings(name, data_prefix, documents, sizes, start_time = time.time() # Use C++ implementation for speed. # First compile and then import. - from megatron.data.dataset_utils import compile_helper compile_helper() from megatron.data import helpers assert doc_idx.dtype == np.int32 @@ -319,3 +316,44 @@ def _build_shuffle_idx(size, np_rng): shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx) return shuffle_idx + + +def compile_helper(): + """Compile helper function ar runtime. 
Make sure this + is invoked on a single process.""" + import os + import subprocess + path = os.path.abspath(os.path.dirname(__file__)) + ret = subprocess.run(['make', '-C', path]) + if ret.returncode != 0: + print("Making C++ dataset helpers module failed, exiting.") + import sys + sys.exit(1) + + +def get_train_valid_test_split_(splits_string, size): + """ Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(',') != -1: + splits = [float(s) for s in splits_string.split(',')] + elif splits_string.find('/') != -1: + splits = [float(s) for s in splits_string.split('/')] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.) + splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 125106623..fa3b9bb78 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -10,14 +10,15 @@ # Added document index to index file and made it accessible. # An empty sentence no longer separates documents. -from functools import lru_cache import os import shutil import struct +from functools import lru_cache from itertools import accumulate import numpy as np import torch + from megatron import print_rank_0 @@ -28,10 +29,6 @@ def __best_fitting_dtype(vocab_size=None): return np.int32 -def get_available_dataset_impl(): - return ['lazy', 'cached', 'mmap'] - - def infer_dataset_impl(path): if IndexedDataset.exists(path): with open(index_file_path(path), 'rb') as f: @@ -200,7 +197,7 @@ def size(self, index): @staticmethod def exists(path): return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) ) @property @@ -532,7 +529,7 @@ def supports_prefetch(self): @staticmethod def exists(path): return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) ) diff --git a/megatron/fp16/__init__.py b/megatron/fp16/__init__.py index 56ee11f79..6e3cd9bc1 100644 --- a/megatron/fp16/__init__.py +++ b/megatron/fp16/__init__.py @@ -12,19 +12,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .fp16util import ( - BN_convert_float, - network_to_half, - prep_param_lists, - model_grads_to_master_grads, - master_params_to_model_params, - tofp16, - to_python_float, - clip_grad_norm, - convert_module, - convert_network, - FP16Model, -) from .fp16 import * -from .loss_scaler import * diff --git a/megatron/fp16/fp16.py b/megatron/fp16/fp16.py index bdea6adbb..512621c28 100755 --- a/megatron/fp16/fp16.py +++ b/megatron/fp16/fp16.py @@ -14,18 +14,8 @@ # limitations under the License. 
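# Worked example (illustrative, not part of the patch) for the
# get_train_valid_test_split_ helper moved into gpt2_dataset.py in the hunk
# above: the weights are normalised, scaled by the document count, and any
# rounding drift is absorbed so the last index always equals `size`.
from megatron.data.gpt2_dataset import get_train_valid_test_split_

print(get_train_valid_test_split_("969,30,1", size=1000))
# [0, 969, 999, 1000] -> train docs [0, 969), valid [969, 999), test [999, 1000)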
"""Stable version of apex FP16 Optimizer""" import torch -from torch import nn from torch.autograd import Variable from torch.nn.parameter import Parameter -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from .loss_scaler import DynamicLossScaler, LossScaler -from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron.module import MegatronModule FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) @@ -64,588 +54,3 @@ def float_conversion(val): return val return conversion_helper(val, float_conversion) - -class FP16_Module(MegatronModule): - def __init__(self, module): - super(FP16_Module, self).__init__() - self.add_module('module', module.half()) - - def forward(self, *inputs, **kwargs): - return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) - - def state_dict(self, destination=None, prefix='', keep_vars=False): - return self.module.state_dict(destination, prefix, keep_vars) - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - return self.module.state_dict_for_save_checkpoint(destination, prefix, - keep_vars) - - def load_state_dict(self, state_dict, strict=True): - self.module.load_state_dict(state_dict, strict=strict) - -# TODO: Update overflow check + downscale to use Carl's fused kernel. - - -class FP16_Optimizer(object): - """ - :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, - and manage static or dynamic loss scaling and master weights in a manner transparent to the user. - For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, - and changing the call to ``backward``. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - # Name the FP16_Optimizer instance to replace the existing optimizer - # (recommended but not required): - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - # loss.backward() becomes: - optimizer.backward(loss) - ... - - Example with dynamic loss scaling:: - - ... - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - # optional arg to control dynamic loss scaling behavior - # dynamic_loss_args={'scale_window' : 500}) - # Usually, dynamic_loss_args is not necessary. - - Args: - init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. - static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. - dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. - dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. 
Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. - verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. - - ``init_optimizer`` is expected to have been constructed in the ordinary way. - It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be - named to replace ``init_optimizer``, for two reasons: - First, it means that references to the same name - later in the file will not have to change. - Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to - modify ``init_optimizer``. If you do choose a unique name for the new - :class:`FP16_Optimizer` instance, you should only work with this new instance, - because the preexisting optimizer might no longer behave as expected. - - ``init_optimizer`` may be any Pytorch optimizer. - It may contain a mixture of fp16 and fp32 parameters organized into any number of - ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will - ingest these ``param_groups`` and remember them. - - Calls to :: - - loss.backward() - - must be replaced with :: - - optimizer.backward(loss) - - because :class:`FP16_Optimizer` requires ownership of the backward pass to implement - loss scaling and copies to master gradients. - - .. note:: - Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients - are downscaled before being applied. This means that adjusting the loss scale, or using - dynamic loss scaling, should not require retuning the learning rate or any other - hyperparameters. - - - **Advanced options** - - **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. - See docstring for :attr:`step`. - - **Gradient clipping**: Use :attr:`clip_master_grads`. - - **Multiple losses**: If your model accumulates gradients from multiple losses, - this can be made more efficient by supplying ``update_master_grads=False`` - to :attr:`backward`. See docstring for :attr:`backward`. - - **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: - - print(optimizer.loss_scale) - optimizer.loss_scale = new_loss_scale - - For static loss scaling, manually adjusting the loss scale over time is a reasonable - thing to do. During later epochs, gradients may become smaller, and a - higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss - scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting - the loss scale is not recommended. - - **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in - Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` - should still work as intended. 
- """ - - def __init__(self, - init_optimizer, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=False): - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - - self.verbose = verbose - - self.optimizer = init_optimizer - # init_state_dict sets up an alternative way to cast per-param state tensors. - # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. - # init_state_dict = init_optimizer.state_dict() - - self.fp16_groups = [] - self.fp32_from_fp16_groups = [] - self.fp32_from_fp32_groups = [] - for i, param_group in enumerate(self.optimizer.param_groups): - self.maybe_print("FP16_Optimizer processing param group {}:".format(i)) - fp16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_fp16_params_this_group = [] - for i, param in enumerate(param_group['params']): - if param.requires_grad: - if param.type() == 'torch.cuda.HalfTensor': - self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" - .format(param.size())) - fp16_params_this_group.append(param) - master_param = param.detach().clone().float() - master_param.requires_grad = True - # Copythe model parallel flag. - master_param.model_parallel = param.model_parallel - param_group['params'][i] = master_param - fp32_from_fp16_params_this_group.append(master_param) - # Reset existing state dict key to the new master param. - # We still need to recast per-param state tensors, if any, to FP32. - if param in self.optimizer.state: - self.optimizer.state[master_param] = self.optimizer.state.pop(param) - elif param.type() == 'torch.cuda.FloatTensor': - self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" - .format(param.size())) - fp32_params_this_group.append(param) - param_group['params'][i] = param - else: - raise TypeError("Wrapped parameters must be either " - "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " - "Received {}".format(param.type())) - - self.fp16_groups.append(fp16_params_this_group) - self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) - self.fp32_from_fp32_groups.append(fp32_params_this_group) - - # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors - self.optimizer.load_state_dict(self.optimizer.state_dict()) - # alternative way to cast per-param state tensors: - # self.optimizer.load_state_dict(init_state_dict) - - if dynamic_loss_scale: - self.dynamic_loss_scale = True - if dynamic_loss_args is not None: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - else: - self.loss_scaler = DynamicLossScaler() - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(static_loss_scale) - - self.overflow = False - self.first_closure_call_this_step = True - - self.clip_grad_norm = clip_grad_norm - - def maybe_print(self, msg): - if self.verbose: - print(msg) - - def __getstate__(self): - raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") - - def __setstate__(self, state): - raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") - - def zero_grad(self, set_grads_to_None=False): - """ - Zero fp32 and fp16 parameter grads. - """ - # In principle, only the .grad attributes of the model params need to be zeroed, - # because gradients are copied into the FP32 master params. 
However, we zero - # all gradients owned by the optimizer, just to be safe: - for group in self.optimizer.param_groups: - for p in group['params']: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - # Zero fp16 gradients owned by the model: - for fp16_group in self.fp16_groups: - for param in fp16_group: - if set_grads_to_None: - param.grad = None - else: - if param.grad is not None: - param.grad.detach_() # as in torch.optim.optimizer.zero_grad() - param.grad.zero_() - - def _check_overflow(self): - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - for group in self.fp32_from_fp32_groups: - for param in group: - params.append(param) - self.overflow = self.loss_scaler.has_overflow(params) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - def _master_params_to_model_params(self): - for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp16_group, fp32_from_fp16_group) - - def _model_params_to_master_params(self): - for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp32_from_fp16_group, fp16_group) - - # To consider: Integrate distributed with this wrapper by registering a hook on each variable - # that does the overflow check, gradient copy + downscale, and fp32 - # allreduce in a different stream. - def _model_grads_to_master_grads(self): - for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): - model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) - - def _downscale_master(self): - if self.loss_scale != 1.0: - for group in self.optimizer.param_groups: - grads = [p.grad for p in group['params'] if p.grad is not None] - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [grads, grads], - 1./self.loss_scale) - - def clip_master_grads(self, max_norm, norm_type=2): - """ - Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. - - Args: - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the current fp32 gradients (viewed as a single vector). - - .. warning:: - Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). - """ - if not self.overflow: - fp32_params = [] - for param_group in self.optimizer.param_groups: - for param in param_group['params']: - fp32_params.append(param) - return self.clip_grad_norm(fp32_params, max_norm, norm_type) - else: - return -1 - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
- Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step - state_dict['optimizer_state_dict'] = self.optimizer.state_dict() - state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups - return state_dict - - def load_state_dict(self, state_dict): - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict['loss_scaler'] - self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] - self.overflow = state_dict['overflow'] - self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] - self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 2. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - for current_group, saved_group in zip( - self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): - for current, saved in zip(current_group, saved_group): - current.data.copy_(saved.data) - - def step(self, closure=None): # could add clip option. - """ - If no closure is supplied, :attr:`step` should be called after - ``fp16_optimizer_obj.backward(loss)``. - :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to - :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params - originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run - another forward pass using their model. - - If a closure is supplied, :attr:`step` may be called without a prior call to - :attr:`backward(loss)`. - This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
- However, the user should take care that any ``loss.backward()`` call within the closure - has been replaced by ``fp16_optimizer_obj.backward(loss)``. - - Args: - closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. - - Example with closure:: - - # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an - # existing pytorch optimizer. - for input, target in dataset: - def closure(): - optimizer.zero_grad() - output = model(input) - loss = loss_fn(output, target) - # loss.backward() becomes: - optimizer.backward(loss) - return loss - optimizer.step(closure) - - .. warning:: - Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. - - .. _`ordinary Pytorch optimizer use`: - http://pytorch.org/docs/master/optim.html#optimizer-step-closure - """ - - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - - if self.overflow: - self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}" - .format(scale, self.loss_scale)) - return - - if closure is not None: - retval = self._step_with_closure(closure) - else: - retval = self.optimizer.step() - - self._master_params_to_model_params() - - return retval - - def _step_with_closure(self, closure): - def wrapped_closure(): - # helpful for debugging - # print("Calling wrapped_closure, first_closure_call_this_step = {}" - # .format(self.first_closure_call_this_step)) - if self.first_closure_call_this_step: - # We expect that the fp16 params are initially fresh on entering self.step(), - # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() - # is called within self.optimizer.step(). - self.first_closure_call_this_step = False - else: - # If self.optimizer.step() internally calls wrapped_closure more than once, - # it may update the fp32 params after each call. However, self.optimizer - # doesn't know about the fp16 params at all. If the fp32 params get updated, - # we can't rely on self.optimizer to refresh the fp16 params. We need - # to handle that manually: - self._master_params_to_model_params() - # Our API expects the user to give us ownership of the backward() call by - # replacing all calls to loss.backward() with optimizer.backward(loss). - # This requirement holds whether or not the call to backward() is made within a closure. - # If the user is properly calling optimizer.backward(loss) within "closure," - # calling closure() here will give the fp32 master params fresh gradients - # for the optimizer to play with, so all wrapped_closure needs to do is call - # closure() and return the loss. - temp_loss = closure() - while(self.overflow): - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, " - "reducing to {}".format(scale, self.loss_scale)) - temp_loss = closure() - return temp_loss - - retval = self.optimizer.step(wrapped_closure) - - self.first_closure_call_this_step = True - - return retval - - def backward(self, loss, update_master_grads=True, retain_graph=False): - """ - :attr:`backward` performs the following conceptual steps: - - 1. fp32_loss = loss.float() (see first Note below) - 2. scaled_loss = fp32_loss*loss_scale - 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). - 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. - 5. Finally, master grads are divided by loss_scale. - - In this way, after :attr:`backward`, the master params have fresh gradients, - and :attr:`step` may be called. - - .. note:: - :attr:`backward` internally converts the loss to fp32 before applying the loss scale. - This provides some additional safety against overflow if the user has supplied an - fp16 loss value. - However, for maximum overflow safety, the user should - compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to - :attr:`backward`. - - .. warning:: - The gradients found in a model's leaves after the call to - :attr:`backward` should not be regarded as valid in general, - because it's possible - they have been scaled (and in the case of dynamic loss scaling, - the scale factor may change over time). - If the user wants to inspect gradients after a call to :attr:`backward`, - only the master gradients should be regarded as valid. These can be retrieved via - :attr:`inspect_master_grad_data()`. - - Args: - loss: The loss output by the user's model. loss may be either float or half (but see first Note above). - update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. - retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). - - Example:: - - # Ordinary operation: - optimizer.backward(loss) - - # Naive operation with multiple losses (technically valid, but less efficient): - # fp32 grads will be correct after the second call, but - # the first call incurs an unnecessary fp16->fp32 grad copy. - optimizer.backward(loss1) - optimizer.backward(loss2) - - # More efficient way to handle multiple losses: - # The fp16->fp32 grad copy is delayed until fp16 grads from all - # losses have been accumulated. - optimizer.backward(loss1, update_master_grads=False) - optimizer.backward(loss2, update_master_grads=False) - optimizer.update_master_grads() - """ - # To consider: try multiple backward passes using retain_grad=True to find - # a loss scale that works. After you find a loss scale that works, do a final dummy - # backward pass with retain_graph=False to tear down the graph. Doing this would avoid - # discarding the iteration, but probably wouldn't improve overall efficiency. - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - if update_master_grads: - self.update_master_grads() - - def update_master_grads(self): - """ - Copy the ``.grad`` attribute from stored references to fp16 parameters to - the ``.grad`` attribute of the fp32 master parameters that are directly - updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if - ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. - """ - if self.dynamic_loss_scale: - self._check_overflow() - if self.overflow: - return - self._model_grads_to_master_grads() - self._downscale_master() - - def inspect_master_grad_data(self): - """ - When running with :class:`FP16_Optimizer`, - ``.grad`` attributes of a model's fp16 leaves should not be - regarded as truthful, because they might be scaled. - After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, - the fp32 master params' ``.grad`` - attributes will contain valid gradients properly divided by the loss scale. However, - because :class:`FP16_Optimizer` flattens some parameters, accessing them may be - nonintuitive. :attr:`inspect_master_grad_data` - allows those gradients to be viewed with shapes corresponding to their associated model leaves. - - Returns: - List of lists (one list for each parameter group). The list for each parameter group - is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. - """ - if self.overflow: - print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. " - "Gradients are currently invalid (may be inf, nan, or stale). Returning None.") - return None - else: - # The optimizer owns only references to master params. - master_grads_data = [] - for param_group in self.optimizer.param_groups: - master_grads_this_group = [] - for param in param_group['params']: - if param.grad is not None: - master_grads_this_group.append(param.grad.data) - else: - master_grads_this_group.append(None) - master_grads_data.append(master_grads_this_group) - return master_grads_data - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) diff --git a/megatron/fp16/fp16util.py b/megatron/fp16/fp16util.py deleted file mode 100644 index 0266ede34..000000000 --- a/megatron/fp16/fp16util.py +++ /dev/null @@ -1,216 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
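# Hedged sketch (not part of the patch) pulling together the FP16_Optimizer
# workflow documented in the fp16.py docstrings above: the only changes versus
# ordinary training are wrapping the optimizer and replacing loss.backward()
# with optimizer.backward(loss). The model size, clip value and `loader` are
# illustrative assumptions.
import torch
from megatron.fp16 import FP16_Optimizer

model = torch.nn.Linear(1024, 1024).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

for x, y in loader:                      # assumed to yield fp16 CUDA tensors
    optimizer.zero_grad()
    # compute the criterion in fp32, as the backward() docstring recommends
    loss = torch.nn.functional.mse_loss(model(x).float(), y.float())
    optimizer.backward(loss)             # scales, backprops, copies fp16 -> fp32 grads
    optimizer.clip_master_grads(1.0)     # returns -1 if this step overflowed
    optimizer.step()                     # skipped internally on overflow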
- -import torch -import torch.nn as nn -from torch.autograd import Variable -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron import mpu - - -class tofp16(nn.Module): - """ - Utility module that implements:: - - def forward(self, input): - return input.half() - """ - - def __init__(self): - super(tofp16, self).__init__() - - def forward(self, input): - return input.half() - - -def BN_convert_float(module): - """ - Utility function for network_to_half(). - - Retained for legacy purposes. - """ - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - module.float() - for child in module.children(): - BN_convert_float(child) - return module - - -def network_to_half(network): - """ - Convert model to half precision in a batchnorm-safe way. - - Retained for legacy purposes. It is recommended to use FP16Model. - """ - return nn.Sequential(tofp16(), BN_convert_float(network.half())) - - -def convert_module(module, dtype): - """ - Converts a module's immediate parameters and buffers to dtype. - """ - for param in module.parameters(recurse=False): - if param is not None: - if param.data.dtype.is_floating_point: - param.data = param.data.to(dtype=dtype) - if param._grad is not None and param._grad.data.dtype.is_floating_point: - param._grad.data = param._grad.data.to(dtype=dtype) - - for buf in module.buffers(recurse=False): - if buf is not None and buf.data.dtype.is_floating_point: - buf.data = buf.data.to(dtype=dtype) - - -def convert_network(network, dtype): - """ - Converts a network's parameters and buffers to dtype. - """ - for module in network.modules(): - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - continue - convert_module(module, dtype) - return network - - -class FP16Model(nn.Module): - """ - Convert model to half precision in a batchnorm-safe way. - """ - - def __init__(self, network): - super(FP16Model, self).__init__() - self.network = convert_network(network, dtype=torch.half) - - def forward(self, *inputs): - inputs = tuple(t.half() for t in inputs) - return self.network(*inputs) - - -def backwards_debug_hook(grad): - raise RuntimeError("master_params recieved a gradient in the backward pass!") - - -def prep_param_lists(model, flat_master=False): - """ - Creates a list of FP32 master parameters for a given model, as in - `Training Neural Networks with Mixed Precision: Real Examples`_. - - Args: - model (torch.nn.Module): Existing Pytorch model - flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. - Returns: - A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. - - Example:: - - model_params, master_params = prep_param_lists(model) - - .. warning:: - Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. - - .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: - http://on-demand.gputechconf.com/gtc/2018/video/S81012/ - """ - model_params = [param for param in model.parameters() if param.requires_grad] - - if flat_master: - # Give the user some more useful error messages - try: - # flatten_dense_tensors returns a contiguous flat array. - # http://pytorch.org/docs/master/_modules/torch/_utils.html - master_params = _flatten_dense_tensors([param.data for param in model_params]).float() - except BaseException: - print("Error in prep_param_lists: model may contain a mixture of parameters " - "of different types. Use flat_master=False, or use F16_Optimizer.") - raise - master_params = torch.nn.Parameter(master_params) - master_params.requires_grad = True - # master_params.register_hook(backwards_debug_hook) - if master_params.grad is None: - master_params.grad = master_params.new(*master_params.size()) - return model_params, [master_params] - else: - master_params = [param.clone().float().detach() for param in model_params] - for param in master_params: - param.requires_grad = True - return model_params, master_params - - -def model_grads_to_master_grads(model_params, master_params, flat_master=False): - """ - Copy model gradients to master gradients. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. - """ - if flat_master: - # The flattening may incur one more deep copy than is necessary. - master_params[0].grad.data.copy_( - _flatten_dense_tensors([p.grad.data for p in model_params])) - else: - for model, master in zip(model_params, master_params): - if model.grad is not None: - if master.grad is None: - master.grad = Variable(master.data.new(*master.data.size())) - else: - master.grad = None - model_grads = [p.grad for p in model_params if p.grad is not None] - master_grads = [p.grad for p in master_params if p.grad is not None] - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [model_grads, master_grads], - 1.0) - - -def master_params_to_model_params(model_params, master_params, flat_master=False): - """ - Copy master parameters to model parameters. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. 
- """ - if flat_master: - for model, master in zip(model_params, - _unflatten_dense_tensors(master_params[0].data, model_params)): - model.data.copy_(master) - else: - for model, master in zip(model_params, master_params): - model.data.copy_(master.data) - -# Backward compatibility fixes - - -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) - -clip_grad_norm = mpu.clip_grad_norm -# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4: -# clip_grad_norm = torch.nn.utils.clip_grad_norm -# else: -# clip_grad_norm = torch.nn.utils.clip_grad_norm_ diff --git a/megatron/fp16/loss_scaler.py b/megatron/fp16/loss_scaler.py deleted file mode 100755 index 126b7863f..000000000 --- a/megatron/fp16/loss_scaler.py +++ /dev/null @@ -1,256 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron import mpu - -# item() is a recent addition, so this helps with backward compatibility. - - -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -class LossScaler: - """ - Class that manages a static loss scale. This class is intended to interact with - :class:`FP16_Optimizer`, and should not be directly manipulated by the user. - - Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to - :class:`FP16_Optimizer`'s constructor. - - Args: - scale (float, optional, default=1.0): The loss scale. - """ - - def __init__(self, scale=1): - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [grad_in, grad_in], - self.loss_scale) - return grad_in - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -class DynamicLossScaler: - """ - Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` - indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of - :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` - operates, because the default options can be changed using the - the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. - - Loss scaling is designed to combat the problem of underflowing gradients encountered at long - times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss - scale. 
Ironically, this may result in OVERflowing gradients. If overflowing gradients are - encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has - occurred. - :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, - and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients detected, - :class:`DynamicLossScaler` increases the loss scale once more. - In this way :class:`DynamicLossScaler` attempts to "ride the edge" of - always using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` - scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. - scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. - """ - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000, - min_scale=1, - delayed_shift=1, - consecutive_hysteresis=False): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - self.min_scale = min_scale - self.delayed_shift = delayed_shift - self.cur_hysteresis = delayed_shift - self.consecutive_hysteresis = consecutive_hysteresis - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params): - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow(self, params): - overflow = self.has_overflow_serial(params) - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - - def _has_inf_or_nan(x): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. 
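# A compact sketch of the scale-update rule this docstring describes, ignoring the
# delayed_shift / consecutive_hysteresis options: halve the scale on overflow, and
# multiply it back up after every `scale_window` clean iterations. Illustrative only,
# not the class's full implementation.
def update_scale(cur_scale, cur_iter, last_overflow_iter, overflow,
                 scale_factor=2.0, scale_window=1000, min_scale=1.0):
    if overflow:
        cur_scale = max(cur_scale / scale_factor, min_scale)
        last_overflow_iter = cur_iter
    elif (cur_iter - last_overflow_iter) % scale_window == 0:
        cur_scale *= scale_factor
    return cur_scale, last_overflow_iter

scale, last = 2.0 ** 32, -1
for step in range(2500):
    overflow = step < 3                     # pretend the first few steps overflow
    scale, last = update_scale(scale, step, last, overflow)
print(scale)                                # climbs back up once per clean scale_window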
- if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether the gradient overflowed - def update_scale(self, overflow): - - if not hasattr(self, 'min_scale'): - self.min_scale = 1 - if not hasattr(self, 'delayed_shift'): - self.delayed_shift = 1 - if not hasattr(self, 'cur_hysteresis'): - self.cur_hysteresis = 1 - if not hasattr(self, 'consecutive_hysteresis'): - self.consecutive_hysteresis = True - if overflow: - # self.cur_scale /= self.scale_factor - if self.delayed_shift == 1 or self.cur_hysteresis == 1: - self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale) - else: - self.cur_hysteresis -= 1 - self.last_overflow_iter = self.cur_iter - else: - if self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: - if not self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - _overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - _overflow_buf, - [grad_in, grad_in], - self.loss_scale) - return grad_in - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -""" -TO-DO separate out into an example. -if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. - N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. - x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. 
/ loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - -""" diff --git a/megatron/global_vars.py b/megatron/global_vars.py index 560d6c778..ba5f23887 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -34,6 +34,7 @@ _GLOBAL_TIMERS = None _GLOBAL_USE_WANDB = False + def get_args(): """Return arguments.""" _ensure_var_is_initialized(_GLOBAL_ARGS, 'args') @@ -67,7 +68,7 @@ def get_timers(): def set_global_variables(): """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers.""" from megatron.neox_arguments import NeoXArgs - args = NeoXArgs.consume_megatron_args() + args = NeoXArgs.consume_neox_args() global _GLOBAL_ARGS _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args') @@ -118,7 +119,7 @@ def _set_tensorboard_writer(args): 'tensorboard writer') if hasattr(args, 'tensorboard_dir') and \ - args.tensorboard_dir and args.rank == 0: + args.tensorboard_dir and args.rank == 0: try: from torch.utils.tensorboard import SummaryWriter print('> setting tensorboard ...') @@ -251,10 +252,12 @@ def log(self, names, normalizer=1.0, reset=True): else: print(string, flush=True) + def get_use_wandb(): global _GLOBAL_USE_WANDB return _GLOBAL_USE_WANDB + def set_use_wandb(b: bool): global _GLOBAL_USE_WANDB - _GLOBAL_USE_WANDB = b \ No newline at end of file + _GLOBAL_USE_WANDB = b diff --git a/megatron/gradient_noise_scale/gradient_noise_scale.py b/megatron/gradient_noise_scale/gradient_noise_scale.py index 35acdefd0..6f4e94b9a 100644 --- a/megatron/gradient_noise_scale/gradient_noise_scale.py +++ b/megatron/gradient_noise_scale/gradient_noise_scale.py @@ -74,7 +74,7 @@ def flatten_grads(self): return torch.cat(grads) def _sync_overflow(self, is_overflow): - if self.args.pipe_parallel_size > 1: + if self.args.is_pipe_parallel: # Since each model parallel GPU carries only part of the model, # make sure overflow flag is synced across all the pipe parallel GPUs overflow_gpu = torch.cuda.ByteTensor([is_overflow]) @@ -104,7 +104,7 @@ def _update(self): # calculate Gbig and Gsmall # this needs to be done in fp32 or it overflows - if self.args.pipe_parallel_size > 1: + if self.args.is_pipe_parallel: g_big = torch.square(torch.norm(grads.to(torch.float))) g_small = torch.square(torch.norm(grad.to(torch.float))) @@ -151,7 +151,7 @@ def _update(self): self.n_updates += 1 def update(self): - if self.args.pipe_parallel_size > 1: + if self.args.is_pipe_parallel: # update on all ranks self._update() else: diff --git a/megatron/initialize.py b/megatron/initialize.py index c3d680d15..9891a3388 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -89,9 +89,6 @@ def finish_mpu_init(): # Megatron's MPU is the master. Complete initialization right away. finish_mpu_init() - # Initialize memory buffers. - _initialize_mem_buffs() - # Autoresume. 
_init_autoresume() @@ -127,10 +124,6 @@ def setup_deepspeed_random_and_activation_checkpointing(args): synchronize=args.synchronize_each_layer, profile=args.profile_backward) - mpu.checkpoint = deepspeed.checkpointing.checkpoint - mpu.get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - mpu.model_parallel_cuda_manual_seed = deepspeed.checkpointing.model_parallel_cuda_manual_seed - def _initialize_distributed(): """Initialize torch.distributed and mpu.""" @@ -158,7 +151,7 @@ def _initialize_distributed(): else: args.local_rank = device torch.cuda.set_device(device) - + distributed.init_distributed( dist_backend=args.distributed_backend, auto_mpi_discovery=True, @@ -167,25 +160,22 @@ def _initialize_distributed(): ) # Setup 3D topology. - if args.pipe_parallel_size > 0: - pp = args.pipe_parallel_size - mp = args.model_parallel_size - assert args.world_size % (pp * mp) == 0, f'world_size={args.world_size}, pp={pp}, mp={mp}' - dp = args.world_size // (pp * mp) - - from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology - # this does pipe on the most outside, then data, then model. - # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order. - topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) - - # Offset base seeds for the interior pipeline stages. - # TODO: adjust last stage too once IO is improved. - stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe - if 0 < stage_id < topo.get_dim('pipe') - 1: - offset = args.seed + 1138 - args.seed = offset + (stage_id * mp) - else: - topo = None + pp = args.pipe_parallel_size if args.pipe_parallel_size >= 1 else 1 + mp = args.model_parallel_size if args.model_parallel_size >= 1 else 1 + assert args.world_size % (pp * mp) == 0, f'world_size={args.world_size}, pp={pp}, mp={mp}' + dp = args.world_size // (pp * mp) + + from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology + # this does pipe on the most outside, then data, then model. + # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order. + topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) + + # Offset base seeds for the interior pipeline stages. + # TODO: adjust last stage too once IO is improved. + stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe + if 0 < stage_id < topo.get_dim('pipe') - 1: + offset = args.seed + 1138 + args.seed = offset + (stage_id * mp) # Set the model-parallel / data-parallel communicators. if device_count > 0: @@ -194,10 +184,8 @@ def _initialize_distributed(): else: mpu.initialize_model_parallel(args.model_parallel_size, topology=topo) - # Optional DeepSpeed Activation Checkpointing Features - # - if args.deepspeed and args.deepspeed_activation_checkpointing: - setup_deepspeed_random_and_activation_checkpointing(args) + # Init DeepSpeed Activation Checkpointing Features + setup_deepspeed_random_and_activation_checkpointing(args) def _init_autoresume(): @@ -228,12 +216,3 @@ def _write_args_to_tensorboard(): if writer: for arg in vars(args): writer.add_text(arg, str(getattr(args, arg))) - - -def _initialize_mem_buffs(): - """Initialize manually allocated static memory.""" - args = get_args() - - # Initialize memory for checkpointed activations. 
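# The arithmetic behind the topology block above, as a standalone sketch. The real
# code hands pp/mp/dp to deepspeed's PipeModelDataParallelTopology; here we only
# reproduce the sizing and the interior-stage seed offset for illustration.
def topology_sizes(world_size, pipe_parallel_size, model_parallel_size):
    pp = pipe_parallel_size if pipe_parallel_size >= 1 else 1
    mp = model_parallel_size if model_parallel_size >= 1 else 1
    assert world_size % (pp * mp) == 0, f'world_size={world_size}, pp={pp}, mp={mp}'
    dp = world_size // (pp * mp)
    return pp, mp, dp

def stage_seed(base_seed, stage_id, pp, mp):
    # Interior pipeline stages get offset seeds; the first and last stages keep the base seed.
    if 0 < stage_id < pp - 1:
        return base_seed + 1138 + stage_id * mp
    return base_seed

pp, mp, dp = topology_sizes(world_size=96, pipe_parallel_size=4, model_parallel_size=2)
print(pp, mp, dp)                                        # 4 2 12
print([stage_seed(1234, s, pp, mp) for s in range(pp)])  # [1234, 2374, 2376, 1234]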
- if args.distribute_checkpointed_activations: - mpu.init_checkpointed_activations_memory_buffer() diff --git a/megatron/memory.py b/megatron/memory.py deleted file mode 100644 index be5a117bc..000000000 --- a/megatron/memory.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - - -# A dictionary of all the memory buffers allocated. -_MEM_BUFFS = dict() - - -def allocate_mem_buff(name, numel, dtype, track_usage): - """Allocate a memory buffer.""" - assert name not in _MEM_BUFFS, \ - 'memory buffer {} already allocated.'.format(name) - _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) - return _MEM_BUFFS[name] - - -def get_mem_buff(name): - """Get the memory buffer.""" - return _MEM_BUFFS[name] - - -class MemoryBuffer: - """Contiguous memory buffer. - Allocate a contiguous memory of type `dtype` and size `numel`. It is - used to reduce memory fragmentation. - - Usage: After the allocation, the `_start` index is set tot the first - index of the memory. A memory chunk starting from `_start` index - can be `allocated` for an input tensor, with the elements of the - tensor being coppied. The buffer can be reused by resetting the - `_start` index. - - """ - def __init__(self, name, numel, dtype, track_usage): - if torch.distributed.get_rank() == 0: - element_size = torch.tensor([], dtype=dtype).element_size() - print('> building the {} memory buffer with {} num elements ' - 'and {} dtype ({:.1f} MB)...'.format( - name, numel, dtype, numel*element_size/1024/1024), - flush=True) - self.name = name - self.numel = numel - self.dtype = dtype - self.data = torch.empty(self.numel, - dtype=self.dtype, - device=torch.cuda.current_device(), - requires_grad=False) - - # Index tracking the start of the free memory. - self._start = 0 - - # Values used for tracking usage. - self.track_usage = track_usage - if self.track_usage: - self.in_use_value = 0.0 - self.total_value = 0.0 - - - def reset(self): - """Reset the buffer start index to the beginning of the buffer.""" - self._start = 0 - - - def is_in_use(self): - """Whether the current buffer hold on to any memory.""" - return self._start > 0 - - - def numel_in_use(self): - """Return number of elements in use.""" - return self._start - - - def add(self, tensor): - """Allocate a chunk of memory from the buffer to tensor and copy - the values.""" - assert tensor.dtype == self.dtype, \ - 'Input tensor type {} different from buffer type {}'.format( - tensor.dtype, self.dtype) - # Number of elements of the input tensor. - tensor_numel = torch.numel(tensor) - new_start = self._start + tensor_numel - assert new_start <= self.numel, \ - 'Not enough memory left in the buffer ({} > {})'.format( - tensor_numel, self.numel - self._start) - # New tensor is a view into the memory. 
- new_tensor = self.data[self._start:new_start] - self._start = new_start - new_tensor = new_tensor.view(tensor.shape) - new_tensor.copy_(tensor) - # Return a pointer to the new tensor. - return new_tensor - - - def get_data(self): - """Return the data currently in use.""" - if self.track_usage: - self.in_use_value += float(self._start) - self.total_value += float(self.numel) - return self.data[:self._start] - - - def print_average_usage(self): - """Print memory usage average over time. We would like this value - to be as high as possible.""" - assert self.track_usage, 'You need to enable track usage.' - if torch.distributed.get_rank() == 0: - print(' > usage of {} memory buffer: {:.2f} %'.format( - self.name, self.in_use_value * 100.0 / self.total_value), - flush=True) - - - -class RingMemBuffer: - """A ring of memory buffers.""" - - def __init__(self, name, num_buffers, numel, dtype, track_usage): - self.num_buffers = num_buffers - self.buffers = [ - allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) - for i in range(num_buffers)] - self._index = -1 - - - def get_next_buffer(self): - self._index += 1 - self._index = self._index % self.num_buffers - buff = self.buffers[self._index] - assert not buff.is_in_use(), 'buffer is already in use.' - return buff diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 396210f1f..8d32f6911 100755 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -16,7 +16,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .gpt2_model import GPT2Model, GPT2ModelPipe +from .gpt2_model import GPT2ModelPipe from .utils import get_params_for_weight_decay_optimization -from .language_model import get_language_model -from .norms import RMSNorm, ScaleNorm, LayerNorm diff --git a/megatron/model/fused_bias_dropout.py b/megatron/model/fused_bias_dropout.py new file mode 100644 index 000000000..b3bb8c8f5 --- /dev/null +++ b/megatron/model/fused_bias_dropout.py @@ -0,0 +1,35 @@ +import torch +import torch.nn.functional as F + +# flags required to enable jit fusion kernels +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x, bias, residual, prob): + # type: (Tensor, Tensor, Tensor, float) -> Tensor + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x, bias, residual, prob): + # type: (Tensor, Tensor, Tensor, float) -> Tensor + return bias_dropout_add(x, bias, residual, prob, False) + diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 0101165ff..db0def9f5 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -21,20 +21,17 @@ import torch from megatron import get_args -from megatron.module import MegatronModule from functools import partial -from .language_model import get_language_model -from .utils import init_method_normal -from .utils import 
scaled_init_method_normal -from .norms import LayerNorm, RMSNorm, ScaleNorm +from megatron.model.utils import init_method_normal, scaled_init_method_normal, Lambda +from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm -# Pipeline parallelism from megatron import mpu from megatron.mpu import ParallelRelativePositionBias import megatron.fp16 as fp16 -from megatron.model.transformer import ParallelTransformerLayerPipe, NormPipe, ParallelLinearPipe, ParallelLinear -from .language_model import EmbeddingPipe, parallel_lm_logits +from megatron.model.transformer import ParallelTransformerLayerPipe, NormPipe, ParallelLinearPipe, parallel_lm_logits +from megatron.model.word_embeddings import EmbeddingPipe +# Pipeline parallelism from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec @@ -65,82 +62,7 @@ def cross_entropy(output, labels, _fp16=False): return loss -class GPT2Model(MegatronModule): - """GPT-2 Language model.""" - - def __init__(self, num_tokentypes=0, parallel_output=True, inference=False, get_key_value=True): - super(GPT2Model, self).__init__() - args = get_args() - self.parallel_output = parallel_output - self.weight_tying = not args.no_weight_tying - self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy - - self.inference = inference - self.get_key_value = get_key_value if inference else False - - self.language_model, self._language_model_key = get_language_model( - attention_mask_func=gpt2_attention_mask_func, - num_tokentypes=num_tokentypes, - init_method=init_method_normal(args.init_method_std), - scaled_init_method=scaled_init_method_normal(args.init_method_std, - args.num_layers), - get_key_value=self.get_key_value) - if not self.weight_tying: - self.final_linear = ParallelLinear(self.parallel_output) - - def forward(self, input_ids, position_ids, attention_mask, - layer_past=None, tokentype_ids=None, forward_method_parallel_output=None, labels=None): - - # Language model. - lm_output = self.language_model(input_ids, - position_ids, - attention_mask, - tokentype_ids=tokentype_ids, - layer_past=layer_past) - - if self.get_key_value: - lm_output, presents = lm_output - - # Output. - parallel_output = self.parallel_output - if forward_method_parallel_output is not None: - parallel_output = forward_method_parallel_output - if self.weight_tying: - output = parallel_lm_logits( - lm_output, - self.language_model.embedding.word_embeddings.weight, - parallel_output) - else: - output, bias = self.final_linear(lm_output) - - if self.get_key_value: - output = [output, presents] - - if labels is None: - return output - else: - if self.fp16_lm_cross_entropy: - assert output.dtype == torch.half - loss = mpu.vocab_parallel_cross_entropy(output, labels) - else: - loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) - return loss - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - state_dict_ = {self._language_model_key: self.language_model.state_dict_for_save_checkpoint( - destination, prefix, keep_vars)} - return state_dict_ - - def load_state_dict(self, state_dict, strict=True): - """Customized load.""" - - if self._language_model_key in state_dict: - state_dict = state_dict[self._language_model_key] - self.language_model.load_state_dict(state_dict, strict=strict) - - -class GPT2ModelPipe(PipelineModule, MegatronModule): +class GPT2ModelPipe(PipelineModule, torch.nn.Module): """GPT2Model adapted for pipeline parallelism. 
The largest change is flattening the GPTModel class so we can express it as a @@ -317,3 +239,33 @@ def _logits_helper(embedding, lm_output): ) # so output in training should just be logits # in inference it will be (logits, presents) (assuming get_key_value) is true + + def to_sequential(self): + """ + Transforms the PipelineModule to a plain nn.Sequential module + :return: + """ + layers = [] + from collections import defaultdict + tied_layers = defaultdict(list) + for n, spec in enumerate(self.specs): + if isinstance(spec, TiedLayerSpec): + if spec.key in tied_layers: + # receiver + layers.append(Lambda(lambda x: spec.forward_fn(tied_layers[spec.key][0], x))) + else: + # owner + module = spec.build(log=False) + layers.append(module) + tied_layers[spec.key].append(module) + elif isinstance(spec, LayerSpec): + layers.append(spec.build(log=False)) + else: + # check that it's a lambda function + LAMBDA = lambda:0 + if isinstance(spec, type(LAMBDA)) and spec.__name__ == LAMBDA.__name__: + # we assume it is a lambda function + layers.append(Lambda(spec)) + else: + raise ValueError(f'Layer number {n} ({spec}) Not recognized') + return torch.nn.Sequential(*layers) diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py deleted file mode 100644 index 7dc61f0f9..000000000 --- a/megatron/model/language_model.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# -# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Transformer based language model.""" - -import torch -import torch.nn.functional as F -from einops import rearrange, repeat - -from megatron import get_args -from megatron import mpu -from megatron.module import MegatronModule -from megatron.model.transformer import ParallelTransformer, SinusoidalPositionalEmbedding, Embedding, EmbeddingPipe -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal, scaled_init_method_normal -from megatron.model.utils import identity - - -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, - bias=None): - """LM logits using word embedding weights.""" - # Parallel logits. - input_parallel = mpu.copy_to_model_parallel_region(input_) - - # Matrix multiply. - if bias is None: - logits_parallel = F.linear(input_parallel, word_embeddings_weight) - else: - logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - - # Gather if needed. 
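# The Lambda wrapper (added to megatron/model/utils.py in this diff) is what lets
# to_sequential() above drop bare functions and tied layers into an nn.Sequential.
# A torch-only illustration; the LayerSpec / TiedLayerSpec plumbing is not reproduced,
# and the tied "receiver" here simply closes over the owner module.
import torch

class Lambda(torch.nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        return self.func(x)

embed = torch.nn.Linear(8, 8)           # stands in for the tied embedding "owner"
seq = torch.nn.Sequential(
    embed,
    Lambda(lambda x: torch.relu(x)),    # a bare lambda from the spec list
    Lambda(lambda x: embed(x)),         # "receiver" reuses the owner's parameters
)
print(seq(torch.randn(2, 8)).shape)     # torch.Size([2, 8])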
- if parallel_output: - return logits_parallel - - return mpu.gather_from_model_parallel_region(logits_parallel) - - -def get_language_model(attention_mask_func, num_tokentypes, - init_method=None, scaled_init_method=None, get_key_value=False): - """Build language model and return along with the key to save.""" - args = get_args() - - if init_method is None: - init_method = init_method_normal(args.init_method_std) - - if scaled_init_method is None: - scaled_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers) - - # Language model. - language_model = TransformerLanguageModel( - attention_mask_func=attention_mask_func, - init_method=init_method, - output_layer_init_method=scaled_init_method, - num_tokentypes=num_tokentypes, - get_key_value=get_key_value) - # key used for checkpoints. - language_model_key = 'language_model' - - return language_model, language_model_key - - -class TransformerLanguageModel(MegatronModule): - """Transformer language model. - - Arguments: - transformer_hparams: transformer hyperparameters - attention_mask_func: a function that takes `unmaksed-attention-scores` - with size [b, np, s, s] and an `attention-mask` and will apply - the masking. The function should return a masked score of the - same size [b, np, s, s]. - masked-attention-scores = attention_mask_func( - unmaksed-attention-scores, attention-mask) - vocab_size: vocabulary size - max_sequence_length: maximum size of sequence. This - is used for positional embedding - embedding_dropout_prob: dropout probability for embeddings - num_tokentypes: size of the token-type embeddings. 0 value - will ignore this embedding - """ - - def __init__(self, - attention_mask_func, - init_method, - output_layer_init_method, - num_tokentypes=0, - get_key_value=False): - super(TransformerLanguageModel, self).__init__() - args = get_args() - - self.hidden_size = args.hidden_size - self.num_tokentypes = num_tokentypes - self.init_method = init_method - self.embedding_type = args.pos_emb - # Embeddings - self.embedding = Embedding(self.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - self.init_method, - self.num_tokentypes) - self._embedding_key = 'embedding' - self.get_key_value = get_key_value - - # Transformer - self.transformer = ParallelTransformer( - attention_mask_func, self.init_method, - output_layer_init_method, get_key_value=self.get_key_value) - self._transformer_key = 'transformer' - - def forward(self, input_ids, position_ids, attention_mask, - tokentype_ids=None, layer_past=None): - - # Embeddings. - embedding_output = self.embedding(input_ids, position_ids, - tokentype_ids=tokentype_ids) - # Transformer. - transformer_output = self.transformer(embedding_output, - attention_mask, - layer_past=layer_past) - return transformer_output - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - """For easy load.""" - - state_dict_ = {} - state_dict_[self._embedding_key] \ - = self.embedding.state_dict_for_save_checkpoint( - destination, prefix, keep_vars) - state_dict_[self._transformer_key] \ - = self.transformer.state_dict_for_save_checkpoint( - destination, prefix, keep_vars) - - return state_dict_ - - def load_state_dict(self, state_dict, strict=True): - """Customized load.""" - - # Embedding. - if self._embedding_key in state_dict: - state_dict_ = state_dict[self._embedding_key] - else: - # for backward compatibility. 
- state_dict_ = {} - for key in state_dict.keys(): - if '_embeddings' in key: - state_dict_[key] = state_dict[key] - self.embedding.load_state_dict(state_dict_, strict=strict) - - # Transformer. - if self._transformer_key in state_dict: - state_dict_ = state_dict[self._transformer_key] - else: - # for backward compatibility. - state_dict_ = {} - for key in state_dict.keys(): - if 'transformer.' in key: - state_dict_[key.split('transformer.')[1]] = state_dict[key] - self.transformer.load_state_dict(state_dict_, strict=strict) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 1f47bc312..547c46c60 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -1,8 +1,7 @@ import torch -from megatron.module import MegatronModule -class SinusoidalPositionalEmbedding(MegatronModule): +class SinusoidalPositionalEmbedding(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() @@ -16,7 +15,7 @@ def forward(self, x, seq_dim=1): return emb[None, :, :] -class RotaryEmbedding(MegatronModule): +class RotaryEmbedding(torch.nn.Module): def __init__(self, dim, base=10000): super().__init__() diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b351918a1..d8a6cd3b4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -25,13 +25,12 @@ from .norms import LayerNorm, RMSNorm, ScaleNorm from megatron import get_args from megatron import mpu -from megatron.module import MegatronModule from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.utils import openai_gelu, erf_gelu, exists -from megatron.mpu import ParallelRelativePositionBias -from megatron.model.positional_embeddings import SinusoidalPositionalEmbedding, RotaryEmbedding, apply_rotary_pos_emb - +from megatron.model.positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb +from megatron.model.fused_bias_dropout import get_bias_dropout_add, bias_dropout_add_fused_train, \ + bias_dropout_add_fused_inference import deepspeed from deepspeed.ops.sparse_attention import SparseSelfAttention, VariableSparsityConfig @@ -63,7 +62,7 @@ """ -class GEGLU(MegatronModule): +class GEGLU(torch.nn.Module): def __init__(self): super(GEGLU, self).__init__() @@ -91,7 +90,7 @@ def forward(self, x, bias=None): return intermediate_parallel * x -class ParallelMLP(MegatronModule): +class ParallelMLP(torch.nn.Module): """MLP. MLP will take the input with h hidden state, project it to 4*h @@ -157,7 +156,7 @@ def forward(self, hidden_states): return output, output_bias -class ParallelLinear(MegatronModule): +class ParallelLinear(torch.nn.Module): """ A Parallel Linear Layer transforming the transformer outputs from hidden_size -> vocab_size """ @@ -178,7 +177,7 @@ def forward(self, hidden_states): return self.final_linear(hidden_states) -class ParallelSelfAttention(MegatronModule): +class ParallelSelfAttention(torch.nn.Module): """Parallel self-attention layer abstract class. 
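# Usage sketch for the fused bias-dropout-add helpers that this diff moves out of
# megatron/model/transformer.py into megatron/model/fused_bias_dropout.py. The
# pattern is out = residual + dropout(x + bias); the fused variants bake the training
# flag in so torch.jit.script can fuse the whole expression. Redefined inline here
# (with explicit annotations) so the snippet runs on its own.
import torch
import torch.nn.functional as F

def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, residual: torch.Tensor,
                     prob: float, training: bool) -> torch.Tensor:
    return residual + F.dropout(x + bias, p=prob, training=training)

@torch.jit.script
def bias_dropout_add_fused_train(x: torch.Tensor, bias: torch.Tensor,
                                 residual: torch.Tensor, prob: float) -> torch.Tensor:
    return bias_dropout_add(x, bias, residual, prob, True)

x, bias, residual = torch.randn(4, 16), torch.zeros(16), torch.randn(4, 16)
out = bias_dropout_add_fused_train(x, bias, residual, 0.1)
print(out.shape)  # torch.Size([4, 16])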
Self-attention layer takes input with size [b, s, h] @@ -456,33 +455,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None): return output, bias -def bias_dropout_add(x, bias, residual, prob, training): - # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor - out = torch.nn.functional.dropout(x + bias, p=prob, training=training) - out = residual + out - return out - - -def get_bias_dropout_add(training): - def _bias_dropout_add(x, bias, residual, prob): - return bias_dropout_add(x, bias, residual, prob, training) - - return _bias_dropout_add - - -@torch.jit.script -def bias_dropout_add_fused_train(x, bias, residual, prob): - # type: (Tensor, Tensor, Tensor, float) -> Tensor - return bias_dropout_add(x, bias, residual, prob, True) - - -@torch.jit.script -def bias_dropout_add_fused_inference(x, bias, residual, prob): - # type: (Tensor, Tensor, Tensor, float) -> Tensor - return bias_dropout_add(x, bias, residual, prob, False) - - -class ParallelTransformerLayer(MegatronModule): +class ParallelTransformerLayer(torch.nn.Module): """A single transformer layer. Transformer layer takes input with size [b, s, h] and returns an @@ -603,164 +576,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): return output -class ParallelTransformer(MegatronModule): - """Transformer class.""" - - def __init__(self, attention_mask_func, - init_method, output_layer_init_method, get_key_value=False): - super(ParallelTransformer, self).__init__() - args = get_args() - - # Store activation checkpoiting flag. - self.checkpoint_activations = args.checkpoint_activations - self.checkpoint_num_layers = args.checkpoint_num_layers - - self.get_key_value = get_key_value - # Number of layers: - self.num_layers = args.num_layers - self.num_unique_layers = args.num_unique_layers - if self.num_unique_layers is None: - self.num_unique_layers = self.num_layers - assert self.num_layers % self.num_unique_layers == 0, \ - 'number of layers should be divisible by number of unique layers' - self.param_sharing_style = args.param_sharing_style - - if args.pos_emb == 'rpe': - rpe_emb = ParallelRelativePositionBias(causal=True, num_buckets=args.rpe_num_buckets, - max_distance=args.rpe_max_distance, - heads=args.num_attention_heads) - - # Transformer layers. - sparsity = args.sparsity - - def build_layer(layer_number): - if sparsity == 'none': - sparse = False - elif sparsity == 'all': - sparse = True - elif sparsity == 'interspersed': - sparse = not layer_number % 2 == 0 - else: - raise ValueError(f'Sparsity type {sparsity} not recognized') - return ParallelTransformerLayer( - attention_mask_func, init_method, - output_layer_init_method, layer_number, sparse=sparse, - rpe=rpe_emb if args.pos_emb == 'rpe' else None, - get_key_value=get_key_value, - rotary=args.pos_emb == 'rotary') - - self.layers = torch.nn.ModuleList( - [build_layer(i + 1) for i in range(self.num_unique_layers)]) - - # Print layer ordering. - if self.num_layers != self.num_unique_layers: - if torch.distributed.get_rank() == 0: - print('> will be using the following layer ordering:') - for i in range(self.num_layers): - print(' layer id: {:3d} --> unique layer id: ' - '{:3d}'.format(i, self._get_layer_index(i)), - flush=True) - - # Final layer norm before output. 
- if args.norm == "rmsnorm": - norm = RMSNorm - eps = args.rms_norm_epsilon - elif args.norm == "layernorm": - eps = args.layernorm_epsilon - norm = LayerNorm - elif args.norm == "scalenorm": - eps = args.scalenorm_epsilon - norm = ScaleNorm - - self.final_layernorm = norm( - args.hidden_size, - eps=eps) - - if deepspeed.checkpointing.is_configured(): - global get_cuda_rng_tracker, checkpoint - get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - checkpoint = deepspeed.checkpointing.checkpoint - - def _get_layer_index(self, layer_number): - if self.param_sharing_style == 'grouped': - return layer_number % self.num_unique_layers - if self.param_sharing_style == 'spaced': - return layer_number // (self.num_layers // self.num_unique_layers) - assert False, 'should not be here' - - def _get_layer(self, layer_number): - return self.layers[self._get_layer_index(layer_number)] - - def _checkpointed_forward(self, hidden_states, attention_mask): - """Forward method with activation checkpointing.""" - - def custom(start, end): - def custom_forward(*inputs): - x_ = inputs[0] - for index in range(start, end): - layer = self._get_layer(index) - x_ = layer(x_, inputs[1]) - return x_ - - return custom_forward - - # Make sure memory is freed. - mpu.reset_checkpointed_activations_memory_buffer() - l = 0 - while l < self.num_layers: - hidden_states = mpu.checkpoint( - custom(l, l + self.checkpoint_num_layers), - hidden_states, attention_mask) - l += self.checkpoint_num_layers - - return hidden_states - - def forward(self, hidden_states, attention_mask, layer_past=None, ): - # Checks - if layer_past is not None and layer_past.numel() > 0: - assert self.get_key_value, \ - 'for not None values in layer_past, ' \ - 'expected get_key_value to be set' - if self.get_key_value: - assert not self.checkpoint_activations, \ - 'get_key_value does not work with ' \ - 'activation checkpointing' - - # data format change to avoid explicit tranposes : [b s h] --> [s b h] - hidden_states = hidden_states.transpose(0, 1).contiguous() - - if self.checkpoint_activations: - hidden_states = self._checkpointed_forward(hidden_states, - attention_mask) - else: - if self.get_key_value: - presents = torch.Tensor() - for index in range(self.num_layers): - layer = self._get_layer(index) - past = None - if layer_past.numel() > 0: - past = layer_past[index] - hidden_states = layer(hidden_states, - attention_mask, - layer_past=past) - if self.get_key_value: - hidden_states, present = hidden_states - if presents.numel() == 0: - presents = present.unsqueeze(dim=0) - else: - presents = torch.cat((presents, present.unsqueeze(dim=0))) - - # reverting data format change [s b h] --> [b s h] - hidden_states = hidden_states.transpose(0, 1).contiguous() - - # Final layer norm. - output = self.final_layernorm(hidden_states) - if self.get_key_value: - output = [output, presents] - - return output - - class ParallelTransformerLayerPipe(ParallelTransformerLayer): """Extends ParallelTransformerLayer to forward attention_mask through the pipeline. 
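# ParallelLinearPipe and NormPipe (below) share one convention: in training a
# pipeline stage receives a bare hidden-state tensor, while in inference it receives
# a (hidden_states, presents) tuple and must pass `presents` through untouched
# (ParallelTransformerLayerPipe above does the analogous thing with the attention
# mask). A generic torch-only sketch of that convention; the class name is illustrative.
import torch

class PassPresentsThrough(torch.nn.Module):
    def __init__(self, inner: torch.nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, args):
        if not isinstance(args, tuple):
            return self.inner(args)                  # training: tensor in, tensor out
        if len(args) == 2:
            hidden_state, presents = args            # inference: keep presents as-is
            return self.inner(hidden_state), presents
        raise ValueError(f'Incorrect number of arguments for {self.__class__.__name__}')

stage = PassPresentsThrough(torch.nn.LayerNorm(16))
print(stage(torch.randn(2, 16)).shape)                       # training path
out, presents = stage((torch.randn(2, 16), torch.zeros(1)))  # inference path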
""" @@ -794,226 +609,59 @@ def forward(self, args): f'In layer {self.layer_number} - Incorrect number of arguments ({len(args)}) for {self.__class__.__name__}') -class NormPipe(MegatronModule): - """Just a helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" - - def __init__(self, norm_class, hidden_size, eps): - super().__init__() - self.norm = norm_class(hidden_size, eps=eps) +class ParallelLinearPipe(ParallelLinear): + """Another helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" def forward(self, args): if not isinstance(args, tuple): # in training, args = hidden_state (tensor, so we check if object isn't a tuple and pass through here) hidden_state = args - return self.norm(hidden_state) + logits, bias = super().forward(hidden_state) + return logits elif len(args) == 2: - # in inference, args will be (hidden_state, presents) + # we are in inference, so input is (hidden_states, presents) hidden_state, presents = args - hidden_state = self.norm(hidden_state) - return hidden_state, presents + logits, bias = super().forward(hidden_state) + return logits, presents else: raise ValueError(f'Incorrect number of arguments for {self.__class__.__name__}') -class Embedding(MegatronModule): - """Language model embeddings. - Arguments: - hidden_size: hidden size - vocab_size: vocabulary size - max_sequence_length: maximum size of sequence. This - is used for positional embedding - embedding_dropout_prob: dropout probability for embeddings - init_method: weight initialization method - num_tokentypes: size of the token-type embeddings. 0 value - will ignore this embedding - """ - - def __init__(self, - hidden_size, - vocab_size, - max_sequence_length, - embedding_dropout_prob, - init_method, - num_tokentypes=0): - super(Embedding, self).__init__() - args = get_args() - self.hidden_size = hidden_size - self.init_method = init_method - self.num_tokentypes = num_tokentypes - - # Word embeddings (parallel). - self.word_embeddings = mpu.VocabParallelEmbedding( - vocab_size, self.hidden_size, init_method=self.init_method) - self._word_embeddings_key = 'word_embeddings' - - # Position embedding (serial). - self.embedding_type = args.pos_emb - if self.embedding_type == "learned": - self.position_embeddings = torch.nn.Embedding( - max_sequence_length, self.hidden_size) - self._position_embeddings_key = 'position_embeddings' - # Initialize the position embeddings. - self.init_method(self.position_embeddings.weight) - elif self.embedding_type == "sinusoidal": - self.position_embeddings = SinusoidalPositionalEmbedding(self.hidden_size) - - # Token type embedding. - # Add this as an optional field that can be added through - # method call so we can load a pretrain model without - # token types and add them as needed. - self._tokentype_embeddings_key = 'tokentype_embeddings' - if self.num_tokentypes > 0: - self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, - self.hidden_size) - # Initialize the token-type embeddings. - self.init_method(self.tokentype_embeddings.weight) - else: - self.tokentype_embeddings = None - - # Embeddings dropout - self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) - - def add_tokentype_embeddings(self, num_tokentypes): - """Add token-type embedding. This function is provided so we can add - token-type embeddings in case the pretrained model does not have it. - This allows us to load the model normally and then add this embedding. 
- """ - if self.tokentype_embeddings is not None: - raise Exception('tokentype embeddings is already initialized') - if torch.distributed.get_rank() == 0: - print('adding embedding for {} tokentypes'.format(num_tokentypes), - flush=True) - self.num_tokentypes = num_tokentypes - self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, - self.hidden_size) - # Initialize the token-type embeddings. - self.init_method(self.tokentype_embeddings.weight) - - def forward(self, input_ids, position_ids, tokentype_ids=None): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - if self.embedding_type in ["learned", "sinusoidal"]: - position_embeddings = self.position_embeddings(position_ids) - embeddings = words_embeddings + position_embeddings - else: - embeddings = words_embeddings - if tokentype_ids is not None: - assert self.tokentype_embeddings is not None - embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) - else: - assert self.tokentype_embeddings is None - - # Dropout. - embeddings = self.embedding_dropout(embeddings) - return embeddings - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - """For easy load.""" - - state_dict_ = {} - state_dict_[self._word_embeddings_key] \ - = self.word_embeddings.state_dict(destination, prefix, keep_vars) - if self.embedding_type == "learned": - state_dict_[self._position_embeddings_key] \ - = self.position_embeddings.state_dict( - destination, prefix, keep_vars) - if self.num_tokentypes > 0: - state_dict_[self._tokentype_embeddings_key] \ - = self.tokentype_embeddings.state_dict( - destination, prefix, keep_vars) - - return state_dict_ - - def load_state_dict(self, state_dict, strict=True): - """Customized load.""" - - # Word embedding. - if self._word_embeddings_key in state_dict: - state_dict_ = state_dict[self._word_embeddings_key] - else: - # for backward compatibility. - state_dict_ = {} - for key in state_dict.keys(): - if 'word_embeddings' in key: - state_dict_[key.split('word_embeddings.')[1]] \ - = state_dict[key] - self.word_embeddings.load_state_dict(state_dict_, strict=strict) - - # Position embedding. - if self.embedding_type == "learned": - if self._position_embeddings_key in state_dict: - state_dict_ = state_dict[self._position_embeddings_key] - else: - # for backward compatibility. - state_dict_ = {} - for key in state_dict.keys(): - if 'position_embeddings' in key: - state_dict_[key.split('position_embeddings.')[1]] \ - = state_dict[key] - self.position_embeddings.load_state_dict(state_dict_, strict=strict) - - # Tokentype embedding. - if self.num_tokentypes > 0: - state_dict_ = {} - if self._tokentype_embeddings_key in state_dict: - state_dict_ = state_dict[self._tokentype_embeddings_key] - else: - # for backward compatibility. 
- for key in state_dict.keys(): - if 'tokentype_embeddings' in key: - state_dict_[key.split('tokentype_embeddings.')[1]] \ - = state_dict[key] - if len(state_dict_.keys()) > 0: - self.tokentype_embeddings.load_state_dict(state_dict_, - strict=strict) - else: - print('***WARNING*** expected tokentype embeddings in the ' - 'checkpoint but could not find it', flush=True) - +class NormPipe(torch.nn.Module): + """Just a helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" -class ParallelLinearPipe(ParallelLinear): - """Another helper class to pass presents through to the output when doing inference with a Pipe Parallel model""" + def __init__(self, norm_class, hidden_size, eps): + super().__init__() + self.norm = norm_class(hidden_size, eps=eps) def forward(self, args): if not isinstance(args, tuple): # in training, args = hidden_state (tensor, so we check if object isn't a tuple and pass through here) hidden_state = args - logits, bias = super().forward(hidden_state) - return logits + return self.norm(hidden_state) elif len(args) == 2: - # we are in inference, so input is (hidden_states, presents) + # in inference, args will be (hidden_state, presents) hidden_state, presents = args - logits, bias = super().forward(hidden_state) - return logits, presents + hidden_state = self.norm(hidden_state) + return hidden_state, presents else: raise ValueError(f'Incorrect number of arguments for {self.__class__.__name__}') -class EmbeddingPipe(Embedding): - """Extends Embedding to forward attention_mask through the pipeline.""" +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, + bias=None): + """LM logits using word embedding weights.""" + # Parallel logits. + input_parallel = mpu.copy_to_model_parallel_region(input_) - @property - def word_embeddings_weight(self): - """Easy accessory for the pipeline engine to tie embeddings across stages.""" - return self.word_embeddings.weight + # Matrix multiply. + if bias is None: + logits_parallel = F.linear(input_parallel, word_embeddings_weight) + else: + logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias) - def forward(self, args): - in_inference = len(args) == 4 # if the length of the args is 4, we're in inference :| - in_train = len(args) == 3 - - input_ids = args[0] - position_ids = args[1] - attention_mask = args[2] - if in_inference: - layer_past = args[3] - elif in_train: - pass - else: - raise ValueError(f'Incorrect number of args passed to {self.__class__.__name__}') + # Gather if needed. 
+ if parallel_output: + return logits_parallel - embeddings = super().forward(input_ids, position_ids) - if in_inference: - return embeddings, layer_past, attention_mask - else: - return embeddings, attention_mask + return mpu.gather_from_model_parallel_region(logits_parallel) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 51bfe4abd..0ddcdbbae 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -22,8 +22,6 @@ import torch -from .transformer import LayerNorm, RMSNorm, ScaleNorm - def init_method_normal(sigma): """Init method based on N(0, sigma).""" @@ -44,15 +42,6 @@ def init_(tensor): return init_ -def get_linear_layer(rows, columns, init_method): - """Simple linear layer with weight initialization.""" - layer = torch.nn.Linear(rows, columns) - init_method(layer.weight) - with torch.no_grad(): - layer.bias.zero_() - return layer - - @torch.jit.script def gelu_impl(x): """OpenAI's gelu implementation.""" @@ -76,6 +65,7 @@ def get_params_for_weight_decay_optimization(module, args): """ weight_decay_params = {'params': []} no_weight_decay_params = {'params': [], 'weight_decay': 0.0} + from .transformer import LayerNorm, RMSNorm, ScaleNorm for module_ in module.modules(): if any([isinstance(module_, LayerNorm), isinstance(module_, RMSNorm), isinstance(module_, ScaleNorm)]) or \ (args.weight_decay == 0.0): # also include all parameters here if no weight decay is being done @@ -97,9 +87,15 @@ def get_params_for_weight_decay_optimization(module, args): return weight_decay_params, no_weight_decay_params -def identity(x, *args, **kwargs): - return x - - def exists(x): return x is not None + + +class Lambda(torch.nn.Module): + def __init__(self, func): + super().__init__() + self.func = func + + def forward(self, x): + return self.func(x) + diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py new file mode 100644 index 000000000..70a49806c --- /dev/null +++ b/megatron/model/word_embeddings.py @@ -0,0 +1,128 @@ +import torch + +from megatron import get_args +from megatron import mpu +from megatron.model.positional_embeddings import SinusoidalPositionalEmbedding + + +class Embedding(torch.nn.Module): + """Language model embeddings. + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + init_method, + num_tokentypes=0): + super(Embedding, self).__init__() + args = get_args() + self.hidden_size = hidden_size + self.init_method = init_method + self.num_tokentypes = num_tokentypes + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, self.hidden_size, init_method=self.init_method) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding (serial). + self.embedding_type = args.pos_emb + if self.embedding_type == "learned": + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, self.hidden_size) + self._position_embeddings_key = 'position_embeddings' + # Initialize the position embeddings. 
+ self.init_method(self.position_embeddings.weight) + elif self.embedding_type == "sinusoidal": + self.position_embeddings = SinusoidalPositionalEmbedding(self.hidden_size) + + # Token type embedding. + # Add this as an optional field that can be added through + # method call so we can load a pretrain model without + # token types and add them as needed. + self._tokentype_embeddings_key = 'tokentype_embeddings' + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + + def add_tokentype_embeddings(self, num_tokentypes): + """Add token-type embedding. This function is provided so we can add + token-type embeddings in case the pretrained model does not have it. + This allows us to load the model normally and then add this embedding. + """ + if self.tokentype_embeddings is not None: + raise Exception('tokentype embeddings is already initialized') + if torch.distributed.get_rank() == 0: + print('adding embedding for {} tokentypes'.format(num_tokentypes), + flush=True) + self.num_tokentypes = num_tokentypes + self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + + def forward(self, input_ids, position_ids, tokentype_ids=None): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + if self.embedding_type in ["learned", "sinusoidal"]: + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + else: + embeddings = words_embeddings + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) + else: + assert self.tokentype_embeddings is None + + # Dropout. + embeddings = self.embedding_dropout(embeddings) + return embeddings + + +class EmbeddingPipe(Embedding): + """Extends Embedding to forward attention_mask through the pipeline.""" + + @property + def word_embeddings_weight(self): + """Easy accessory for the pipeline engine to tie embeddings across stages.""" + return self.word_embeddings.weight + + def forward(self, args): + in_inference = len(args) == 4 # if the length of the args is 4, we're in inference :| + in_train = len(args) == 3 + + input_ids = args[0] + position_ids = args[1] + attention_mask = args[2] + if in_inference: + layer_past = args[3] + elif in_train: + pass + else: + raise ValueError(f'Incorrect number of args passed to {self.__class__.__name__}') + + embeddings = super().forward(input_ids, position_ids) + if in_inference: + return embeddings, layer_past, attention_mask + else: + return embeddings, attention_mask + diff --git a/megatron/module.py b/megatron/module.py deleted file mode 100644 index a78c228f9..000000000 --- a/megatron/module.py +++ /dev/null @@ -1,31 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
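# A single-GPU sketch of what parallel_lm_logits (re-homed into transformer.py in
# this diff) computes once the mpu copy/gather regions are stripped away: with tied
# weights, the output logits are just a linear map with the word-embedding matrix
# that EmbeddingPipe.word_embeddings_weight exposes. Sizes below are illustrative.
import torch
import torch.nn.functional as F

vocab_size, hidden_size = 100, 16
word_embeddings = torch.nn.Embedding(vocab_size, hidden_size)

def lm_logits(hidden_states: torch.Tensor) -> torch.Tensor:
    # [batch, seq, hidden] x [vocab, hidden]^T -> [batch, seq, vocab]
    return F.linear(hidden_states, word_embeddings.weight)

hidden = torch.randn(2, 8, hidden_size)
print(lm_logits(hidden).shape)  # torch.Size([2, 8, 100])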
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Megatron Module""" - -import torch - - -class MegatronModule(torch.nn.Module): - """Megatron specific extentions of torch Module.""" - - def __init__(self): - super(MegatronModule, self).__init__() - - def state_dict_for_save_checkpoint(self, destination=None, prefix='', - keep_vars=False): - """Use this function to override the state dict for - saving checkpoints.""" - return self.state_dict(destination, prefix, keep_vars) diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py index bed81b74b..419e596e5 100644 --- a/megatron/mpu/__init__.py +++ b/megatron/mpu/__init__.py @@ -19,8 +19,6 @@ from .data import broadcast_data -from .grads import clip_grad_norm - from .initialize import is_unitialized from .initialize import destroy_model_parallel from .initialize import get_data_parallel_group @@ -50,9 +48,7 @@ from .random import checkpoint from .random import get_cuda_rng_tracker -from .random import init_checkpointed_activations_memory_buffer from .random import model_parallel_cuda_manual_seed -from .random import reset_checkpointed_activations_memory_buffer from .utils import divide from .utils import split_tensor_along_last_dim diff --git a/megatron/mpu/grads.py b/megatron/mpu/grads.py deleted file mode 100644 index f1e511ea9..000000000 --- a/megatron/mpu/grads.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Parts of the code here are adapted from PyTorch -# repo: https://github.com/pytorch/pytorch - - -import torch -from torch._six import inf - -try: - from apex.multi_tensor_apply import multi_tensor_applier - import amp_C - -except Exception as e: - print('WARNING: APEX is not installed, multi_tensor_applier will not be available.') - -from .initialize import get_model_parallel_group -from .initialize import get_model_parallel_rank - - -def l2_grad_clipper(parameters, max_norm): - """Efficient L2 norm gradient clipping.""" - - overflow_buf = torch.zeros(1, dtype=torch.int, device='cuda') - # Make sure we have an iterable. - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - # Filter parameters with gradients. - parameters_with_grads = list(filter( - lambda p: p.grad is not None, parameters)) - # Filter parameters for norm calculations. - mp_rank_is_zero = (get_model_parallel_rank() == 0) - parameters_for_norm = list(filter( - lambda p: p.model_parallel or mp_rank_is_zero, parameters_with_grads)) - # Calculate L2 norm. 
- norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - overflow_buf, - [parameters_for_norm], - False # no per-parameter norm - ) - # Sum across all model parallel GPUs. - norm_2 = norm * norm - torch.distributed.all_reduce(norm_2, - op=torch.distributed.ReduceOp.SUM, - group=get_model_parallel_group()) - total_norm = norm_2.item() ** 0.5 - # Scale to get max_norm. - clip_coef = float(max_norm) / (total_norm + 1.0e-6) - grads = [p.grad for p in parameters_with_grads] - if clip_coef < 1.0: - multi_tensor_applier( - amp_C.multi_tensor_scale, - overflow_buf, - [grads, grads], - clip_coef) - return total_norm - - -def clip_grad_norm(parameters, max_norm, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - max_norm = float(max_norm) - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - # Take max across all GPUs. - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) - #elif norm_type == 2: - # total_norm = l2_grad_clipper(parameters, max_norm) - - else: - total_norm = 0 - for p in parameters: - if p.model_parallel or (get_model_parallel_rank() == 0): - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item() ** norm_type - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() ** (1. / norm_type) - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) - return total_norm diff --git a/megatron/mpu/random.py b/megatron/mpu/random.py index 547d90b6b..b3eaecba6 100644 --- a/megatron/mpu/random.py +++ b/megatron/mpu/random.py @@ -1,319 +1,22 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
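The deleted `clip_grad_norm` above is standard global-norm clipping plus a model-parallel all-reduce of the per-rank norms. The single-process sketch below keeps only the core arithmetic (local norm, clip coefficient, in-place scaling); it is illustrative only and omits the distributed reduction and the `model_parallel` ownership filter.

```python
import torch

def clip_grad_norm_local(parameters, max_norm, norm_type=2.0):
    """Single-process version of the clipping arithmetic (no all_reduce, no MP filtering)."""
    grads = [p.grad for p in parameters if p.grad is not None]
    total_norm = torch.norm(torch.stack([g.norm(norm_type) for g in grads]), norm_type)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for g in grads:
            g.mul_(clip_coef)   # gradients are modified in place, as in the original
    return total_norm.item()

w = torch.nn.Parameter(torch.randn(10))
(w ** 2).sum().backward()
print(clip_grad_norm_local([w], max_norm=1.0))   # pre-clip norm; w.grad is now scaled to norm <= 1
```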
- - -# Parts of the code here are adapted from PyTorch -# repo: https://github.com/pytorch/pytorch - -import contextlib - -import torch -from torch import _C -from torch.cuda import _lazy_call, device as device_ctx_manager -from torch.utils.checkpoint import detach_variable - -from megatron import get_args -from megatron.memory import allocate_mem_buff - -from .initialize import get_data_parallel_rank -from .initialize import get_model_parallel_group -from .initialize import get_model_parallel_rank -from .initialize import get_model_parallel_world_size +# mostly moving to using checkpointing from deepspeed (identical code anyway) so currently this file is only imports +# TODO: should be able to get rid of this file entirely +import deepspeed +import deepspeed.runtime.activation_checkpointing.checkpointing as checkpointing # Default name for the model parallel rng tracker. -_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' - +_MODEL_PARALLEL_RNG_TRACKER_NAME = deepspeed.checkpointing._MODEL_PARALLEL_RNG_TRACKER_NAME # Whether apply model parallelsim to checkpointed hidden states. _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None - -def init_checkpointed_activations_memory_buffer(): - """Initializ the memory buffer for the checkpointed activations.""" - args = get_args() - - per_layer = args.batch_size * args.max_position_embeddings * \ - args.hidden_size // args.model_parallel_size - assert args.num_layers % args.checkpoint_num_layers == 0, \ - 'number of layers is not divisible by checkpoint-num-layers' - num_checkpointer_layers = args.num_layers // args.checkpoint_num_layers - numel = per_layer * num_checkpointer_layers - dtype = torch.half - if not (args.precision == "fp16"): - dtype = torch.float - - global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER - assert _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None, \ - 'checkpointed activations memory buffer is already allocated.' - _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = allocate_mem_buff( - 'checkpointed activations', numel, dtype, track_usage=False) - - -def reset_checkpointed_activations_memory_buffer(): - """Reset the memory used for checkpointing.""" - if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None: - _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.reset() - - -def _set_cuda_rng_state(new_state, device=-1): - """Sets the random number generator state of the current GPU. - - Argumentss: - new_state (torch.ByteTensor): The desired state - This function is adapted from PyTorch repo (torch.cuda.set_rng_state) - with a single change: the input state is not cloned. Cloning caused - major performance issues for +4 GPU cases. 
- """ - if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): - # older PyTorch - def cb(): - with device_ctx_manager(device): - _C._cuda_setRNGState(new_state) - else: - # newer PyTorch - if device == -1: - device = torch.device('cuda') - elif isinstance(device, str): - device = torch.device(device) - elif isinstance(device, int): - device = torch.device('cuda', device) - - def cb(): - idx = device.index - if idx is None: - idx = torch.cuda.current_device() - default_generator = torch.cuda.default_generators[idx] - default_generator.set_state(new_state) - - _lazy_call(cb) - - -def split_tensor_into_1d_equal_chunks(tensor): - """Break a tensor into equal 1D chunks.""" - data = tensor.view(-1) - partition_size = torch.numel(data) // get_model_parallel_world_size() - start_index = partition_size * get_model_parallel_rank() - end_index = start_index + partition_size - return data[start_index:end_index] - - -def gather_split_1d_tensor(tensor): - """Opposite of above function, gather values from model parallel ranks.""" - world_size = get_model_parallel_world_size() - numel = torch.numel(tensor) - numel_gathered = world_size * numel - gathered = torch.empty(numel_gathered, dtype=tensor.dtype, - device=torch.cuda.current_device(), - requires_grad=False) - chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)] - torch.distributed.all_gather(chunks, tensor, - group=get_model_parallel_group()) - return gathered - - -class CudaRNGStatesTracker: - """Tracker for the cuda RNG states. - - Using the `add` method, a cuda rng state is initialized based on - the input `seed` and is assigned to `name`. Later, by forking the - rng state, we can perform operations and return to our starting - cuda state. - """ - - def __init__(self): - # Map from a string name to the cuda rng state. - self.states_ = {} - # Seeds are just for book keeping and ensure no seed is set twice. - self.seeds_ = set() - - def reset(self): - """Set to the initial state (no tracker).""" - self.states_ = {} - self.seeds_ = set() - - def get_states(self): - """Get rng states. Copy the dictionary so we have direct - pointers to the states, not just a pointer to the dictionary.""" - states = {} - for name in self.states_: - states[name] = self.states_[name] - return states - - def set_states(self, states): - """Set the rng states. For efficiency purposes, we do not check - the size of seed for compatibility.""" - self.states_ = states - - def add(self, name, seed): - """Track the rng state.""" - # Check seed is not already used. - if seed in self.seeds_: - raise Exception('seed {} already exists'.format(seed)) - self.seeds_.add(seed) - # Check that state is not already defined. - if name in self.states_: - raise Exception('cuda rng state {} already exists'.format(name)) - # Get the current rng state. - orig_rng_state = torch.cuda.get_rng_state() - # Set the new state and store it. - torch.cuda.manual_seed(seed) - self.states_[name] = torch.cuda.get_rng_state() - # Reset rng state to what it was. - _set_cuda_rng_state(orig_rng_state) - - @contextlib.contextmanager - def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): - """Fork the cuda rng state, perform operations, and exit with - the original state.""" - # Check if we have added the state - if name not in self.states_: - raise Exception('cuda rng state {} is not added'.format(name)) - # Store current rng state. 
- orig_cuda_rng_state = torch.cuda.get_rng_state() - # Set rng state to the desired one - _set_cuda_rng_state(self.states_[name]) - # Do the stuff we wanted to do. - try: - yield - finally: - # Update the current rng state for later use. - self.states_[name] = torch.cuda.get_rng_state() - # And set the state to the original state we started with. - _set_cuda_rng_state(orig_cuda_rng_state) - - # RNG tracker object. -_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() - - -def get_cuda_rng_tracker(): - """Get cuda rng tracker.""" - return _CUDA_RNG_STATE_TRACKER - - -def model_parallel_cuda_manual_seed(seed): - """Initialize model parallel cuda seed. - - This function should be called after the model parallel is - initialized. Also, no torch.cuda.manual_seed should be called - after this function. Basically, this is replacement for that - function. - Two set of RNG states are tracked: - default state: This is for data parallelism and is the same among a - set of model parallel GPUs but different across - different model paralle groups. This is used for - example for dropout in the non-model-parallel regions. - model-parallel state: This state is different among a set of model - parallel GPUs, but the same across data parallel - groups. This is used for example for dropout in - model parallel regions. - """ - # 2718 is just for fun and any POSITIVE value will work. - offset = seed + 2718 - model_parallel_seed = offset + get_model_parallel_rank() - # Data parallel gets the original sedd. - data_parallel_seed = seed - - if torch.distributed.get_rank() == 0: - print('> initializing model parallel cuda seeds on global rank {}, ' - 'model parallel rank {}, and data parallel rank {} with ' - 'model parallel seed: {} and data parallel seed: {}'.format( - torch.distributed.get_rank(), get_model_parallel_rank(), - get_data_parallel_rank(), model_parallel_seed, - data_parallel_seed), flush=True) - _CUDA_RNG_STATE_TRACKER.reset() - # Set the default state. - torch.cuda.manual_seed(data_parallel_seed) - # and model parallel state. - _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, - model_parallel_seed) - - -class CheckpointFunction(torch.autograd.Function): - """This function is adapted from torch.utils.checkpoint with - two main changes: - 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` - 2) the states in the model parallel tracker are also properly - tracked/set/reset. - """ - @staticmethod - def forward(ctx, run_function, *args): - ctx.run_function = run_function - - # Copy the rng states. - ctx.fwd_cpu_rng_state = torch.get_rng_state() - ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() - ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() - - with torch.no_grad(): - outputs = run_function(*args) - - # Divide hidden states across model parallel group and only keep - # the chunk corresponding to the current rank. - if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None: - ctx.input_0_shape = args[0].data.shape - args[0].data = split_tensor_into_1d_equal_chunks(args[0].data) - args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add( - args[0].data) - - # Store everything. 
- ctx.save_for_backward(*args) - - - return outputs - - @staticmethod - def backward(ctx, *args): - if not torch.autograd._is_checkpoint_valid(): - raise RuntimeError("Checkpointing is not compatible with .grad(), " - "please use .backward() if possible") - inputs = ctx.saved_tensors - if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None: - inputs[0].data = gather_split_1d_tensor(inputs[0].data) - inputs[0].data = inputs[0].data.view(ctx.input_0_shape) - - # Store the current states. - bwd_cpu_rng_state = torch.get_rng_state() - bwd_cuda_rng_state = torch.cuda.get_rng_state() - bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() - - # Set the states to what it used to be before the forward pass. - torch.set_rng_state(ctx.fwd_cpu_rng_state) - _set_cuda_rng_state(ctx.fwd_cuda_rng_state) - get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) - - # Compute the forward pass. - detached_inputs = detach_variable(inputs) - with torch.enable_grad(): - outputs = ctx.run_function(*detached_inputs) - - # Set the states back to what it was at the start of this function. - torch.set_rng_state(bwd_cpu_rng_state) - _set_cuda_rng_state(bwd_cuda_rng_state) - get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) - - if isinstance(outputs, torch.Tensor): - outputs = (outputs,) - torch.autograd.backward(outputs, args) - grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp - for inp in detached_inputs) - return (None,) + grads +_CUDA_RNG_STATE_TRACKER = deepspeed.checkpointing._CUDA_RNG_STATE_TRACKER +# Deepspeed checkpointing functions +# TODO: replace calls to these in our codebase with calls to the deepspeed ones +_set_cuda_rng_state = checkpointing._set_cuda_rng_state +checkpoint = checkpointing.checkpoint +model_parallel_cuda_manual_seed = checkpointing.model_parallel_cuda_manual_seed +get_cuda_rng_tracker = checkpointing.get_cuda_rng_tracker -def checkpoint(function, *args): - """Checkpoint a model or part of the model. - This has been directly copied from torch.utils.checkpoint.""" - return CheckpointFunction.apply(function, *args) diff --git a/megatron/neox_arguments/__init__.py b/megatron/neox_arguments/__init__.py index 2cc0152ba..afe9cb571 100644 --- a/megatron/neox_arguments/__init__.py +++ b/megatron/neox_arguments/__init__.py @@ -19,7 +19,7 @@ * NeoXArgs.from_dict({"num_layers": 12, ...}): load attribute values from dict; checks unknown arguments are performed * NeoXArgs.consume_deepy_args(): entry point for deepy.py configuring and consuming command line arguments (i.e. user_script, conf_dir, conf_file, wandb_group, wandb_team); neox_args.get_deepspeed_main_args() produces a list of command line arguments to feed to deepspeed.launcher.runner.main -* NeoXArgs.consume_megatron_args(): In the call stack deepy.py -> deepspeed -> pretrain_gpt2.py; arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(). So produced arguments can be read with consume_megatron_args() to instantiate a NeoXArgs instance. +* NeoXArgs.consume_neox_args(): In the call stack deepy.py -> deepspeed -> pretrain_gpt2.py; arguments are passed to pretrain_gpt2.py by neox_args.get_deepspeed_main_args(). So produced arguments can be read with consume_neox_args() to instantiate a NeoXArgs instance. 
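The replacement above turns `megatron/mpu/random.py` into thin aliases of DeepSpeed's activation-checkpointing module, so the tracker semantics stay the same. As a reminder of what those semantics are, here is a CPU-only toy tracker (not the repo's or DeepSpeed's implementation) showing the `add`/`fork` contract the aliases preserve: `add(name, seed)` snapshots a named RNG state, and `fork(name)` temporarily swaps it in and saves it back on exit.

```python
import contextlib
import torch

class ToyRNGTracker:
    def __init__(self):
        self.states = {}

    def add(self, name, seed):
        orig = torch.get_rng_state()
        torch.manual_seed(seed)
        self.states[name] = torch.get_rng_state()   # snapshot the seeded state
        torch.set_rng_state(orig)                   # leave the default stream untouched

    @contextlib.contextmanager
    def fork(self, name):
        orig = torch.get_rng_state()
        torch.set_rng_state(self.states[name])
        try:
            yield
        finally:
            self.states[name] = torch.get_rng_state()  # remember where this stream left off
            torch.set_rng_state(orig)

tracker = ToyRNGTracker()
tracker.add("model-parallel-rng", seed=1234)
with tracker.fork("model-parallel-rng"):
    print(torch.rand(2))   # drawn from the model-parallel stream
print(torch.rand(2))       # drawn from the default stream, unaffected by the fork
```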
**code structure** diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index f01e75b1f..48cd25e22 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -3,26 +3,20 @@ import json import logging import shortuuid +import copy +import torch +import argparse -import dataclasses from dataclasses import dataclass from typing import List -from pathlib import Path from socket import gethostname from typing import Literal, Dict - -import torch - from deepspeed.launcher.runner import DLTS_HOSTFILE - from megatron.logging import Tee from megatron.utils import obtain_resource_pool - from .deepspeed_args import NeoXArgsDeepspeedConfig, NeoXArgsDeepspeedRunner -from .megatron_args import NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, \ - NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen, NeoXArgsOptimizer, NeoXArgsLRScheduler - -import argparse +from .neox_args import NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, \ + NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen, NeoXArgsOptimizer, NeoXArgsLRScheduler # ZERO defaults by deespeed # These values should not be changed unless defaults in deepspeed are changed @@ -54,22 +48,23 @@ } BASE_CLASSES = [ - NeoXArgsDeepspeedRunner, + NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, - NeoXArgsModel, + NeoXArgsModel, NeoXArgsLRScheduler, NeoXArgsOptimizer, NeoXArgsTokenizer, - NeoXArgsTraining, + NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen - ] +] DEEPSPEED_ARG_CLASSES = [NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig] NEOX_ARG_CLASSES = [i for i in BASE_CLASSES if i not in DEEPSPEED_ARG_CLASSES] + @dataclass class NeoXArgs(*BASE_CLASSES): """ @@ -88,19 +83,19 @@ def __post_init__(self): calculate values, assert consistency and do typechecking. 
""" if not NeoXArgs.validate_keys(): - raise ValueError(self.__class__.__name__+".__post_init__() NeoXArgs keys cannot be validated") + raise ValueError(self.__class__.__name__ + ".__post_init__() NeoXArgs keys cannot be validated") self.enable_logging() self.configure_distributed_args() self.calculate_derived() - + if not self.validate_types(): - raise ValueError(self.__class__.__name__+".__post_init__() NeoXArgs types cannot be validated") + raise ValueError(self.__class__.__name__ + ".__post_init__() NeoXArgs types cannot be validated") if not self.validate_values(): - raise ValueError(self.__class__.__name__+".__post_init__() NeoXArgs values cannot be validated") - + raise ValueError(self.__class__.__name__ + ".__post_init__() NeoXArgs values cannot be validated") + self.save_yml() @classmethod @@ -113,7 +108,7 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) overwrite_values: If provided, overwrite any values in the yamls with these values """ - print(cls.__name__+".from_ymls() "+str(paths_to_yml_files), flush=True) + print(cls.__name__ + ".from_ymls() " + str(paths_to_yml_files), flush=True) # initialize an empty config dictionary to be filled by yamls config = dict() @@ -128,15 +123,18 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) # check for key duplicates and load values for conf_key, conf_value in conf.items(): if conf_key in config: - raise ValueError(f'Conf file {conf_file_name} has the following duplicate keys with previously loaded file: {conf_key}') + raise ValueError( + f'Conf file {conf_file_name} has the following duplicate keys with previously loaded file: {conf_key}') - conf_key_converted = conf_key.replace("-", "_") #TODO remove replace and update configuration files? + conf_key_converted = conf_key.replace("-", "_") # TODO remove replace and update configuration files? config[conf_key_converted] = conf_value # Configuration parameters not specified - params_not_in_config = sorted(list(set(cls.__dataclass_fields__.keys()) - set(config.keys()))) + params_not_in_config = sorted(list(set(cls.__dataclass_fields__.keys()) - set(config.keys()))) if len(params_not_in_config) > 0: - logging.debug(cls.__name__+".from_ymls() Configuration parameters not specified (using defaults): "+", ".join(params_not_in_config)) + logging.debug( + cls.__name__ + ".from_ymls() Configuration parameters not specified (using defaults): " + ", ".join( + params_not_in_config)) if overwrite_values is not None: for k, v in overwrite_values.items(): @@ -170,30 +168,29 @@ def consume_deepy_args(cls): group = parser.add_argument_group(title='Training Configuration') group.add_argument("user_script", - type=str, - help="User script to launch, followed by any required " - "arguments.") + type=str, + help="User script to launch, followed by any required " + "arguments.") group.add_argument("--conf_dir", '-d', - type=str, - default=None, - help="Directory to prefix to all configuration file paths") + type=str, + default=None, + help="Directory to prefix to all configuration file paths") group.add_argument("conf_file", - type=str, - nargs='+', - help="Configuration file path. Multiple files can be provided and will be merged.") - + type=str, + nargs='+', + help="Configuration file path. 
Multiple files can be provided and will be merged.") + group = parser.add_argument_group(title='Weights and Biases monitoring args') group.add_argument('--wandb_group', type=str, default=None, - help='Weights and Biases group name - used to group together "runs".') + help='Weights and Biases group name - used to group together "runs".') group.add_argument('--wandb_team', type=str, default=None, - help='Team name for Weights and Biases.') + help='Team name for Weights and Biases.') args_parsed = parser.parse_args() - # Validate user_script exists assert os.path.exists(args_parsed.user_script), f"User script could not be found: {args_parsed.user_script}" @@ -205,17 +202,22 @@ def consume_deepy_args(cls): # enables us to pass in `small` instead of `small.yml` conf_files = [(cf if cf.endswith('.yml') else cf + ".yml") for cf in conf_files] + # determine overwrite values + overwrite_values = dict() + if args_parsed.wandb_group is not None: + overwrite_values["wandb_group"] = args_parsed.wandb_group + if args_parsed.wandb_team is not None: + overwrite_values["wandb_team"] = args_parsed.wandb_team + if args_parsed.user_script is not None: + overwrite_values["user_script"] = args_parsed.user_script + # load args - neox_args = cls.from_ymls(paths_to_yml_files=conf_files, overwrite_values={ - "wandb_group": args_parsed.wandb_group, - "wandb_team": args_parsed.wandb_team, - "user_script": args_parsed.user_script - }) + neox_args = cls.from_ymls(paths_to_yml_files=conf_files, overwrite_values=overwrite_values) return neox_args @classmethod - def consume_megatron_args(cls): + def consume_neox_args(cls): """ Deepspeed launcher needs to pass the arguments for `pretrain_gpt2.py` across to all machines. @@ -245,7 +247,6 @@ def convert_key_value_to_command_line_arg(k, v): return [] return [f'--{k}', str(v)] - def get_deepspeed_main_args(self): args_list = list() @@ -265,9 +266,9 @@ def get_deepspeed_main_args(self): # get all config values args_list.append("--megatron_config") - megatron_args = self.get_parent_class_value_dict(*self.__class__.__bases__, only_non_defaults=True) - args_list.append(json.dumps(megatron_args)) - + neox_args = self.get_parent_class_value_dict(*self.__class__.__bases__, only_non_defaults=True) + args_list.append(json.dumps(neox_args)) + return args_list ############################################################################################################################ @@ -298,7 +299,7 @@ def get_parent_class_value_dict(self, *parent_classes, only_non_defaults=False) """ takes a sequence of parent classes and returns corresponding values (with defaults set) """ - #TODO no Nones or non-defaults + # TODO no Nones or non-defaults result = dict() for parent in parent_classes: for key, default_value in parent().defaults(): @@ -320,7 +321,7 @@ def params_dtype(self): ############################################################################################################################ # start of logging and output - + def enable_logging(self): """ enable Tee logs based on the configured logdir @@ -329,7 +330,7 @@ def enable_logging(self): os.makedirs(self.log_dir, exist_ok=True) hostname = gethostname() file_prefix = os.path.join(self.log_dir, hostname) - Tee(file_prefix+'_stdout.txt', err=False) + Tee(file_prefix + '_stdout.txt', err=False) Tee(file_prefix + '_stderr.txt', err=True) def save_yml(self): @@ -348,7 +349,6 @@ def print(self): print('-------------------- arguments --------------------', flush=True) str_list = [] for arg in vars(self): - # add arg + value dots = 
'.' * (32 - len(arg)) value = getattr(self, arg) @@ -360,7 +360,6 @@ def print(self): dots = '.' * (64 - len(print_str)) print_str += dots + default_info - str_list.append(print_str) for arg in sorted(str_list, key=lambda x: x.lower()): print(arg, flush=True) @@ -376,14 +375,16 @@ def configure_distributed_args(self): if self.deepspeed_mpi: from deepspeed.utils.distributed import mpi_discovery mpi_discovery() - + self.update_value("local_rank", int(os.getenv('LOCAL_RANK', '0'))) self.update_value("rank", int(os.getenv('RANK', '0'))) self.update_value("world_size", int(os.getenv("WORLD_SIZE", '1'))) self.update_value("model_parallel_size", min(self.model_parallel_size, self.world_size)) if self.rank == 0: - print(self.__class__.__name__+".configure_distributed_args() using world size: {} and model-parallel size: {} ".format(self.world_size, self.model_parallel_size), flush=True) + print( + self.__class__.__name__ + ".configure_distributed_args() using world size: {} and model-parallel size: {} ".format( + self.world_size, self.model_parallel_size), flush=True) @staticmethod def calculate_batch_parameters(dp_world_size, train_batch=None, micro_batch=None, grad_acc=None): @@ -440,8 +441,8 @@ def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc): assert train_batch == micro_batch * grad_acc * dp_world_size, \ (f'Check batch related parameters. train_batch_size is not equal' - ' to micro_batch_per_gpu * gradient_acc_step * world_size' - f'{train_batch} != {micro_batch} * {grad_acc} * {dp_world_size}') + ' to micro_batch_per_gpu * gradient_acc_step * world_size' + f'{train_batch} != {micro_batch} * {grad_acc} * {dp_world_size}') def calculate_derived(self): """ @@ -453,15 +454,12 @@ def calculate_derived(self): if self.wandb_group is None: # if none is defined a uuid is set for the run self.wandb_group = shortuuid.uuid() - else: - # if one is defined it is concatenated with a uuid to make the run unique - self.wandb_group = str(self.wandb_group) + shortuuid.uuid() # number of gpus # Get number of GPUs param or hostfile to determine train_batch_size num_gpus = self.num_gpus if num_gpus is None: - num_gpus = -1 # set -1 for backwards compatibility to old default value + num_gpus = -1 # set -1 for backwards compatibility to old default value if num_gpus < 1: if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE): hostfile_path = self.hostfile or DLTS_HOSTFILE @@ -471,7 +469,8 @@ def calculate_derived(self): num_gpus = torch.cuda.device_count() self.update_value("num_gpus", num_gpus) - logging.info(self.__class__.__name__+".calculate_derived() "+f"Total number of GPUs determined to be: {self.num_gpus}") + logging.info( + self.__class__.__name__ + ".calculate_derived() " + f"Total number of GPUs determined to be: {self.num_gpus}") # get world size in the model/pipe parallel case, the actual `world size` deepspeed uses is the size of the # data-parallel group, or (num_gpus / mp_size) / pp_size @@ -479,25 +478,26 @@ def calculate_derived(self): pp_size = pp_size if pp_size >= 1 else 1 mp_size = self.model_parallel_size mp_size = mp_size if mp_size >= 1 else 1 - + self.update_value("model_parallel_size", mp_size) + # pp_size and mp_size are only used here to compute dp world size and nowhere else. 
dp_world_size = ((num_gpus / pp_size) / mp_size) if not (dp_world_size % 1 == 0): - error_message = self.__class__.__name__+".calculate_derived() "+f"(num_gpus / pp_size) / mp_size [({num_gpus} / {pp_size}) / {mp_size}] must be a whole number" + error_message = self.__class__.__name__ + ".calculate_derived() " + f"(num_gpus / pp_size) / mp_size [({num_gpus} / {pp_size}) / {mp_size}] must be a whole number" logging.error(error_message) raise AssertionError(error_message) # Automatically derive train_batch_size = train_micro_batch_size_per_gpu*num_gpus*gradient_accumulation_steps train_batch_size, train_micro_batch_size_per_gpu, gradient_accumulation_steps = self.calculate_batch_parameters( - dp_world_size=dp_world_size, - train_batch=self.train_batch_size, - micro_batch=self.train_micro_batch_size_per_gpu, + dp_world_size=dp_world_size, + train_batch=self.train_batch_size, + micro_batch=self.train_micro_batch_size_per_gpu, grad_acc=self.gradient_accumulation_steps - ) + ) self.check_batch_parameters( - dp_world_size=dp_world_size, - train_batch=train_batch_size, - micro_batch=train_micro_batch_size_per_gpu, + dp_world_size=dp_world_size, + train_batch=train_batch_size, + micro_batch=train_micro_batch_size_per_gpu, grad_acc=gradient_accumulation_steps ) self.update_values({ @@ -516,13 +516,16 @@ def calculate_derived(self): # zero optimization if self.zero_optimization is None: - self.zero_optimization = copy.deepcopy(ZERO_DEFAULTS) # a dict is overwritten and not updated key by key + self.zero_optimization = copy.deepcopy(ZERO_DEFAULTS) # a dict is overwritten and not updated key by key self.update_values({ "zero_stage": self.zero_optimization.get('stage', ZERO_DEFAULTS['stage']), "zero_reduce_scatter": self.zero_optimization.get('reduce_scatter', ZERO_DEFAULTS['reduce_scatter']), - "zero_contiguous_gradients": self.zero_optimization.get('contiguous_gradients', ZERO_DEFAULTS['contiguous_gradients']), - "zero_reduce_bucket_size": self.zero_optimization.get('reduce_bucket_size', ZERO_DEFAULTS['reduce_bucket_size']), - "zero_allgather_bucket_size": self.zero_optimization.get('allgather_bucket_size', ZERO_DEFAULTS['allgather_bucket_size']) + "zero_contiguous_gradients": self.zero_optimization.get('contiguous_gradients', + ZERO_DEFAULTS['contiguous_gradients']), + "zero_reduce_bucket_size": self.zero_optimization.get('reduce_bucket_size', + ZERO_DEFAULTS['reduce_bucket_size']), + "zero_allgather_bucket_size": self.zero_optimization.get('allgather_bucket_size', + ZERO_DEFAULTS['allgather_bucket_size']) }) # optimizer and scheduler @@ -542,11 +545,15 @@ def calculate_derived(self): "warmup_max_lr": self.lr, "warmup_num_steps": int(self.train_iters * self.warmup), "total_num_steps": self.lr_decay_iters or self.train_iters - }} + }} # Fp16 loss scaling. 
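A quick worked example of the derivation in `calculate_derived()` above, using hypothetical values (16 GPUs, pipe parallel 2, model parallel 2). The invariant enforced by `check_batch_parameters` is `train_batch == micro_batch * grad_acc * dp_world_size`; given any two of the three batch parameters, the third follows.

```python
# Hypothetical cluster: values chosen for illustration only.
num_gpus, pp_size, mp_size = 16, 2, 2

dp_world_size = (num_gpus / pp_size) / mp_size            # 4.0 -- must be a whole number
assert dp_world_size % 1 == 0

micro_batch_per_gpu = 4
gradient_accumulation_steps = 8
train_batch_size = int(micro_batch_per_gpu * gradient_accumulation_steps * dp_world_size)

print(dp_world_size, train_batch_size)                    # 4.0 128
```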
self.update_value("dynamic_loss_scale", self.loss_scale is None) + # Update 'is pipe parallel' flag + # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with + # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs + self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) ############################################################################################################################ # start of validation functions @@ -563,12 +570,13 @@ def validate_keys(cls): source_vars = list(source_class.__dataclass_fields__) for item in source_vars: if item in defined_properties.keys(): - logging.error(f'({cls.__name__}) duplicate of item: {item}, in class {source_class.__name__} and {defined_properties[item]}') + logging.error( + f'({cls.__name__}) duplicate of item: {item}, in class {source_class.__name__} and {defined_properties[item]}') return False else: defined_properties[item] = source_class.__name__ return True - + def validate_values(self): # the current codebase assumes running with deepspeed only if not self.deepspeed: @@ -576,7 +584,7 @@ def validate_values(self): # learning rate if self.lr is None: - error_message = self.__class__.__name__+".validate_values() lr is None" + error_message = self.__class__.__name__ + ".validate_values() lr is None" logging.error(error_message) raise ValueError(error_message) return False @@ -585,67 +593,58 @@ def validate_values(self): required_args = ['num_layers', 'hidden_size', 'num_attention_heads', 'max_position_embeddings'] for req_arg in required_args: if getattr(self, req_arg) is None: - error_message = self.__class__.__name__+".validate_values() "+req_arg+" is None." + error_message = self.__class__.__name__ + ".validate_values() " + req_arg + " is None." logging.error(error_message) raise ValueError(error_message) return False # Checks. if self.hidden_size % self.num_attention_heads != 0: - error_message = self.__class__.__name__+".validate_values() hidden_size must be divisable by num_attention_heads" + error_message = self.__class__.__name__ + ".validate_values() hidden_size must be divisable by num_attention_heads" logging.error(error_message) raise ValueError(error_message) return False if self.seq_length is not None: - if not(self.max_position_embeddings >= self.seq_length): - error_message = self.__class__.__name__+".validate_values() max_position_embeddings must be bigger or equal seq_length" + if not (self.max_position_embeddings >= self.seq_length): + error_message = self.__class__.__name__ + ".validate_values() max_position_embeddings must be bigger or equal seq_length" logging.error(error_message) raise ValueError(error_message) return False - - if not(self.min_lr <= self.lr): - error_message = self.__class__.__name__+".validate_values() min_lr must be smaller or equal lr" + + if not (self.min_lr <= self.lr): + error_message = self.__class__.__name__ + ".validate_values() min_lr must be smaller or equal lr" logging.error(error_message) raise ValueError(error_message) return False if self.save is not None and self.save_interval is None: - error_message = self.__class__.__name__+".validate_values() save_interval must be defined if save is defined" + error_message = self.__class__.__name__ + ".validate_values() save_interval must be defined if save is defined" logging.error(error_message) raise ValueError(error_message) return False # Parameters sharing does not work with torch DDP. 
if (self.num_unique_layers is not None) and (self.num_layers is not None): - + if not (self.num_unique_layers <= self.num_layers): - error_message = self.__class__.__name__+".validate_values() num-unique-layers must be smaller or equal num_layers" + error_message = self.__class__.__name__ + ".validate_values() num-unique-layers must be smaller or equal num_layers" logging.error(error_message) raise ValueError(error_message) return False if not (self.num_layers % self.num_unique_layers == 0): - error_message = self.__class__.__name__+".validate_values() num-layers should be divisible by num-unique-layers" + error_message = self.__class__.__name__ + ".validate_values() num-layers should be divisible by num-unique-layers" logging.error(error_message) raise ValueError(error_message) return False - if self.fp16_lm_cross_entropy and self.precision != "fp16": - error_message = self.__class__.__name__+".validate_values() lm cross entropy in fp16 only support in fp16 mode." - logging.error(error_message) - raise ValueError(error_message) - return False - - # Activation checkpointing. - if self.distribute_checkpointed_activations and not self.checkpoint_activations: - error_message = self.__class__.__name__+".validate_values() 'for distribute-checkpointed-activations to work you need to enable checkpoint-activations'" + error_message = self.__class__.__name__ + ".validate_values() lm cross entropy in fp16 only support in fp16 mode." logging.error(error_message) raise ValueError(error_message) return False - return True def validate_types(self): @@ -655,16 +654,16 @@ def validate_types(self): for field_name, field_def in self.__dataclass_fields__.items(): actual_value = getattr(self, field_name) - if actual_value is None: - continue # we allow for some values not to be configured + if actual_value is None: + continue # we allow for some values not to be configured actual_type = type(actual_value) if actual_type != field_def.type: - if actual_type == int and field_def.type == float: # floats should be able to be configured as ints + if actual_type == int and field_def.type == float: # floats should be able to be configured as ints continue # for typing.Literal (i.e a list of choices) - checks that actual value is in accepted values - elif field_def.type.__origin__ == Literal: + elif field_def.type.__origin__ == Literal: accepted_values = field_def.type.__args__ if actual_value in accepted_values: continue @@ -673,36 +672,42 @@ def validate_types(self): lowercase_accepted_values = [i.lower() for i in accepted_values if isinstance(i, str)] if actual_value.lower() in lowercase_accepted_values: continue - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: '{actual_value}' Not in accepted values: '{accepted_values}'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_value}' Not in accepted values: '{accepted_values}'") return False - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: '{actual_type}' instead of '{field_def.type}'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_type}' instead of '{field_def.type}'") return False - + # validate deepspeed dicts for field_name in ["optimizer", "scheduler"]: value = getattr(self, field_name) - if isinstance(value, dict): # dict is checked above, only fields are checked here + if isinstance(value, dict): # dict is checked above, only fields are checked here if "type" in value: if not isinstance(value["type"], str): - 
logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: key 'type' must be a string") - return False + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: key 'type' must be a string") + return False else: - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: must contain key 'type'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'type'") return False if "params" in value: if not isinstance(value["params"], dict): - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: key 'params' must be a dict") - return False + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: key 'params' must be a dict") + return False else: - logging.error(self.__class__.__name__+".validate_types() "+f"{field_name}: must contain key 'params'") + logging.error( + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'params'") return False - + for field_name in ["fp16", "amp", "flops_profiler"]: value = getattr(self, field_name) if isinstance(value, dict): if not "enabled" in value: - error_message = self.__class__.__name__+".validate_types() "+f"{field_name}: must contain key 'enabled'" + error_message = self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'enabled'" logging.error(error_message) return False diff --git a/megatron/neox_arguments/megatron_args.py b/megatron/neox_arguments/neox_args.py similarity index 96% rename from megatron/neox_arguments/megatron_args.py rename to megatron/neox_arguments/neox_args.py index c10ab2ea7..3b1c1bcca 100644 --- a/megatron/neox_arguments/megatron_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -3,6 +3,7 @@ from .template import NeoXArgsTemplate from typing import Literal + def get_git_commit_hash(): """ Gets the git commit hash of your current repo (if it exists) """ try: @@ -12,14 +13,14 @@ def get_git_commit_hash(): git_hash = None return git_hash + @dataclass class NeoXArgsParallelism(NeoXArgsTemplate): - pipe_parallel_size: int = 0 """ Number of pipeline parallel stages. Disable with 0. """ - + model_parallel_size: int = 1 """ Size of the model parallelism. @@ -35,11 +36,16 @@ class NeoXArgsParallelism(NeoXArgsTemplate): Total world size (i.e number of gpus in cluster). Configured post-launch using distributed launcher """ + is_pipe_parallel: bool = False + """ + flag to determine whether pipeline parallelism is on - shouldn't be set by user, is automatically determined + according to pipeline parallel size. + """ + @dataclass class NeoXArgsModel(NeoXArgsTemplate): - - precision: Literal["fp16", "fp32"] = None + precision: Literal["fp16", "fp32"] = None """ description of the used precision, either one of fp16 or fp32 (and in the future bf16). """ @@ -188,13 +194,13 @@ class NeoXArgsModel(NeoXArgsTemplate): """ Run attention masking and softmax in fp32. """ - - rotary_pct: float = 1.0 + + rotary_pct: float = 1.0 """ pct of hidden dims to apply rotary positional embedding to """ - rotary_emb_base: int = 10000 + rotary_emb_base: int = 10000 """ Base for rotary positional embedding """ @@ -202,7 +208,6 @@ class NeoXArgsModel(NeoXArgsTemplate): @dataclass class NeoXArgsOptimizer(NeoXArgsTemplate): - optimizer_type: Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3'] = "adam" """ Type of optimizer to use. 
Choose from ['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3'] @@ -212,28 +217,28 @@ class NeoXArgsOptimizer(NeoXArgsTemplate): """ Zero Optimizer stage """ - + zero_reduce_scatter: bool = None """ Zero: Uses reduce or reduce scatter instead of allreduce to average gradients """ - + zero_contiguous_gradients: bool = None """ Zero: Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. """ - + zero_reduce_bucket_size: int = None """ Zero: Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes """ - + zero_allgather_bucket_size: int = None """ Zero: Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes """ - lr: float = None + lr: float = None """ Max Learning rate during training """ @@ -274,7 +279,6 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): @dataclass class NeoXArgsLogging(NeoXArgsTemplate): - wandb_group: str = None """Weights and Biases group name - used to group together "runs".""" @@ -334,7 +338,6 @@ class NeoXArgsLogging(NeoXArgsTemplate): @dataclass class NeoXArgsOther(NeoXArgsTemplate): - distributed_backend: str = "nccl" """ Which backend to use for distributed training. @@ -407,7 +410,7 @@ class NeoXArgsOther(NeoXArgsTemplate): """ Run via MPI, this will attempt to discover the necessary variables to initialize torch distributed from the MPI environment """ - + user_script: str = None """ user script to be run @@ -433,10 +436,11 @@ class NeoXArgsOther(NeoXArgsTemplate): Set during training """ + @dataclass class NeoXArgsTokenizer(NeoXArgsTemplate): - - tokenizer_type: Literal["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"] = "GPT2BPETokenizer" + tokenizer_type: Literal[ + "GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"] = "GPT2BPETokenizer" """ Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "CharLevelTokenizer"] """ @@ -450,7 +454,6 @@ class NeoXArgsTokenizer(NeoXArgsTemplate): @dataclass class NeoXArgsTextgen(NeoXArgsTemplate): - text_gen_type: str = None """ How to generate text/sample the model. @@ -510,7 +513,6 @@ class NeoXArgsTextgen(NeoXArgsTemplate): @dataclass class NeoXArgsTraining(NeoXArgsTemplate): - data_path: str = None """ Path to combined dataset to split. @@ -641,44 +643,39 @@ class NeoXArgsTraining(NeoXArgsTemplate): Chunk size (number of layers) for checkpointing. """ - distribute_checkpointed_activations: bool = False - """ - If set, distribute checkpointed activations across model parallel group. - """ - - deepspeed_activation_checkpointing: bool = False + deepspeed_activation_checkpointing: bool = True """ + DEPRECATED - TODO: remove Uses activation checkpointing from deepspeed """ - + contiguous_checkpointing: bool = False """ Contiguous memory checkpointing for activations. """ - + checkpoint_in_cpu: bool = False """ Move the activation checkpoints to CPU. """ - + synchronize_each_layer: bool = False """ does a synchronize at the beginning and end of each checkpointed layer. """ - + profile_backward: bool = False """ Enables backward pass profiling for checkpointed layers. """ - + partition_activations: bool = False """ Partition Activations across GPUs before checkpointing. """ gas: int = None - """gradient_accumulation_steps""" #TODO this is a duplicate, remove? 
- + """gradient_accumulation_steps""" # TODO this is a duplicate, remove? clip_grad: float = None """ @@ -709,4 +706,4 @@ class NeoXArgsTraining(NeoXArgsTemplate): min_scale: float = 1.0 """ Minimum loss scale for dynamic loss scale. - """ \ No newline at end of file + """ diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 1997bbb2a..b24a82a76 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -21,6 +21,7 @@ import json import os import time +from typing import List, Union import torch import torch.nn.functional as F @@ -29,8 +30,7 @@ from megatron import get_tokenizer from megatron import mpu from megatron.utils import get_ltor_masks_and_position_ids, is_mp_rank_0 -from megatron.fp16 import fp32_to_fp16 -from typing import List, Union + def get_batch(context_tokens): """Generate batch from context tokens.""" @@ -165,14 +165,22 @@ def forward_model(model, model_inputs): # we need to forward a pipe model by access model.module() instead of just model() args = get_args() torch.distributed.barrier() - if args.pipe_parallel_size == 1: + if args.pipe_parallel_size <= 1: return model.module(model_inputs) - elif args.pipe_parallel_size > 1: - data_iterator = iter([[model_inputs, torch.Tensor(1)]]) # we need to feed in fake labels bc deepspeed is only built for training + else: + data_iterator = iter( + [[model_inputs, torch.Tensor(1)]]) # we need to feed in fake labels bc deepspeed is only built for training x = model.inference_batch(data_iterator) return x - else: - return model(*model_inputs) + + +def broadcast_terminate_signal(terminate_runs: int): + """Send signal to all workers to terminate if we've finished the process""" + terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) + torch.distributed.broadcast(terminate_runs_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + return terminate_runs_tensor[0].item() def sample_sequence_batch(model, context_tokens, context_lengths, @@ -215,10 +223,10 @@ def sample_sequence_batch(model, context_tokens, context_lengths, if args.recompute: # we need to use args instead of kwargs here because deepspeed :| model_inputs = (tokens, - position_ids, - attention_mask, - torch.Tensor(), - ) + position_ids, + attention_mask, + torch.Tensor(), + ) logits = forward_model(model, model_inputs) logits = logits[:, context_length - 1, :] else: @@ -231,10 +239,10 @@ def sample_sequence_batch(model, context_tokens, context_lengths, positions2use = position_ids[:, context_length - 1].view( batch_size, -1) # we have to use args instead of kwargs here because deepspeed :| - model_inputs = (tokens2use, # input_ids - positions2use, # position_ids - attention_mask, # attention_mask - layer_past, # layer_past + model_inputs = (tokens2use, # input_ids + positions2use, # position_ids + attention_mask, # attention_mask + layer_past, # layer_past ) logits, layer_past = forward_model(model, model_inputs) @@ -247,14 +255,14 @@ def sample_sequence_batch(model, context_tokens, context_lengths, logits = logits.float() logits /= args.temperature logits = top_k_logits(logits, top_k=args.top_k, - top_p=args.top_p) + top_p=args.top_p) log_probs = F.softmax(logits, dim=-1) prev = torch.multinomial(log_probs, num_samples=1).view(-1) print_logits = [] for p in prev: print_logits.append([logits[i, p].item() - for i in range(batch_size)]) + for i in range(batch_size)]) started = context_lengths <= context_length tokens[:, context_length] = switch( tokens[:, 
context_length].view(-1), prev, started) @@ -290,7 +298,7 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): assert any([isinstance(text, str), isinstance(text, list)]), "Text should be in string or list form" if isinstance(text, str): text = [text] - + if is_mp_rank_0(): input_count = len(text) input_pos = 0 @@ -300,7 +308,7 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): generated_texts = [] while True: start_time = time.time() - + # Tokenize text, and check whether we should terminate process terminate_runs = 0 if is_mp_rank_0(): @@ -315,19 +323,14 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): if context_length >= (args.seq_length // 2): print_rank_0("\nContext length", context_length, - "\nPlease give smaller context (half of the " - "sequence length)!", flush=True) + "\nPlease give smaller context (half of the " + "sequence length)!", flush=True) continue else: context_tokens = tokenizer.tokenize("EMPTY TEXT") context_length = len(context_tokens) - # Send signal to all workers to terminate if we've finished the process - terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) - torch.distributed.broadcast(terminate_runs_tensor, - mpu.get_model_parallel_src_rank(), - group=mpu.get_model_parallel_group()) - terminate_runs = terminate_runs_tensor[0].item() + terminate_runs = broadcast_terminate_signal(terminate_runs) if terminate_runs == 1: return generated_texts @@ -349,7 +352,7 @@ def generate_samples_from_prompt(model, text: Union[List[str], str]): generated_texts.append(data) if iterations % args.log_interval == 0: print_rank_0('Avg s/batch:', - (time.time() - start_time) / min(args.log_interval, iterations + 1)) + (time.time() - start_time) / min(args.log_interval, iterations + 1)) start_time = time.time() iterations += 1 @@ -375,7 +378,7 @@ def generate_samples_input_from_file(model): if args.sample_output_file is None: sample_output_file = args.sample_input_file + ".out" print_rank_0('could not find `sample-output-file`, setting ' - 'it to {}'.format(sample_output_file)) + 'it to {}'.format(sample_output_file)) else: sample_output_file = args.sample_output_file f_out = open(sample_output_file, "w+") @@ -417,19 +420,14 @@ def generate_samples_interactive(model, print_frequency=24): if context_length >= (args.seq_length // 2): print_rank_0("\nContext length", context_length, - "\nPlease give smaller context (half of the " - "sequence length)!", flush=True) + "\nPlease give smaller context (half of the " + "sequence length)!", flush=True) continue else: context_tokens = tokenizer.tokenize("EMPTY TEXT") context_length = len(context_tokens) - terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) - torch.distributed.broadcast(terminate_runs_tensor, - mpu.get_model_parallel_src_rank(), - group=mpu.get_model_parallel_group()) - terminate_runs = terminate_runs_tensor[0].item() - + terminate_runs = broadcast_terminate_signal(terminate_runs) if terminate_runs == 1: return @@ -439,7 +437,7 @@ def generate_samples_interactive(model, print_frequency=24): decode_tokens = decode_tokens[0].cpu().numpy().tolist() if mpu.get_model_parallel_rank() == 0 and \ - counter % print_frequency == 0: + counter % print_frequency == 0: os.system('clear') print_rank_0("\nContext:", raw_text, flush=True) trim_decode_tokens = tokenizer.detokenize( @@ -489,7 +487,7 @@ def generate_samples_unconditional(model): if token_stream is None: break if ctr % args.log_interval == 0: print_rank_0('Avg s/batch:', - (time.time() - 
start_time) / min(args.log_interval, ctr + 1)) + (time.time() - start_time) / min(args.log_interval, ctr + 1)) start_time = time.time() length = len(token_stream) token_batch = token_stream[0].cpu().numpy().tolist() diff --git a/megatron/training.py b/megatron/training.py index c6225c597..f8e8d2a1c 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -26,8 +26,6 @@ import sys import torch -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from apex.optimizers import FusedAdam as Adam from megatron import get_args from megatron import get_timers @@ -35,7 +33,6 @@ from megatron import print_rank_0 from megatron.checkpointing import load_checkpoint from megatron.checkpointing import save_checkpoint -from megatron.fp16 import FP16_Module from megatron.global_vars import get_use_wandb from megatron.initialize import initialize_megatron from megatron.learning_rates import AnnealingLR @@ -138,8 +135,6 @@ def get_optimizer(model): if args.no_load_optim: return None, None # Build parameter groups (weight decay and non-decay). - while isinstance(model, (torchDDP, FP16_Module)): - model = model.module param_groups = get_params_for_weight_decay_optimization(model, args) print_rank_0(f'Configuring Optimizer type: {args.optimizer_type} with params: {args.optimizer["params"]}') # Add model parallel attribute if it is not set. @@ -168,6 +163,13 @@ def get_optimizer(model): **args.optimizer["params"]) elif args.optimizer_type.lower() == "adam": # Use Adam + try: + # default to apex as it's slightly faster + from apex.optimizers import FusedAdam as Adam + except ImportError: + # if apex isn't installed, use deepspeed's FusedAdam + print("WARNING: APEX not installed - defaulting to deepspeed's fused adam") + from deepspeed.ops.adam import FusedAdam as Adam optimizer = Adam(param_groups, weight_decay=args.weight_decay, **args.optimizer["params"]) @@ -237,16 +239,15 @@ def setup_model_and_optimizer(model_provider_func): optimizer=optimizer, args=args, lr_scheduler=_lr_scheduler, - mpu=mpu if args.pipe_parallel_size == 0 else None, dist_init_required=False, model_parameters=_model_params, config_params=args.deepspeed_config, + mpu=mpu if not args.is_pipe_parallel else None ) - model.total_params = get_total_params(model.module) print_rank_0(f' > total params: {"{:,}".format(model.total_params)}') - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: model.set_batch_fn(model.module._megatron_batch_fn) else: raise ValueError("Must be using deepspeed to run neox") @@ -257,11 +258,6 @@ def setup_model_and_optimizer(model_provider_func): else: args.iteration = 0 - # get model without FP16 and/or TorchDDP wrappers - unwrapped_model = model - while hasattr(unwrapped_model, 'module'): - unwrapped_model = unwrapped_model.module - return model, optimizer, lr_scheduler @@ -293,9 +289,11 @@ def train_step(forward_step_func, data_iterator, timers = get_timers() # Pipeline parallelism schedules forward/backward/step - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: return train_step_pipe(model, data_iterator) + # TODO: Dead code (?) + # Forward model for one step. 
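The optimizer change above prefers apex's `FusedAdam` and falls back to DeepSpeed's fused Adam when apex is not installed. Below is a generic sketch of that optional-dependency pattern; the final `torch.optim.AdamW` fallback is added here only so the snippet runs without either package installed and is not part of the repo's logic.

```python
import torch

try:
    from apex.optimizers import FusedAdam as Adam          # fastest path, needs apex
except ImportError:
    try:
        from deepspeed.ops.adam import FusedAdam as Adam   # next best, needs deepspeed
    except ImportError:
        from torch.optim import AdamW as Adam              # always available (sketch-only fallback)

params = [torch.nn.Parameter(torch.randn(4, 4))]
optimizer = Adam(params, lr=1e-4, weight_decay=0.01)
print(type(optimizer).__name__)
```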
timers('forward').start() loss, loss_reduced = forward_step_func(data_iterator, model) @@ -371,7 +369,7 @@ def add_to_logging(name): if name in timers.timers: timers_to_log.append(name) - if args.pipe_parallel_size <= 0: + if not args.is_pipe_parallel: add_to_logging('forward') add_to_logging('backward') add_to_logging('backward-backward') @@ -602,11 +600,10 @@ def evaluate_and_print_results(prefix, forward_step_func, # Pipeline parallelism needs eval_batch() instead of a simple forward(). args = get_args() - if args.pipe_parallel_size > 0: - def _eval_helper(data_iter, pipe_model): + if args.is_pipe_parallel: + def _eval_helper(data_iter, _): loss = model.eval_batch(data_iter) return None, {'lm loss': loss} - forward_step_func = _eval_helper total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose) @@ -634,7 +631,7 @@ def build_train_valid_test_data_iterators( print_rank_0('> building train, validation, and test datasets ...') # Ensure only the first/last pipeline stages have data loaders - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: is_first_stage = mpu.get_pipe_parallel_rank() == 0 is_last_stage = mpu.get_pipe_parallel_rank() == mpu.get_pipe_parallel_world_size() - 1 pipe_load = is_first_stage or is_last_stage @@ -679,7 +676,7 @@ def build_train_valid_test_data_iterators( flags = torch.cuda.LongTensor([0, 0, 0]) # Broadcast num tokens. - if args.pipe_parallel_size > 0: + if args.is_pipe_parallel: # Only first/last pipeline stages have data loaders, so pipeline parallelism should # broadcast globally instead of just the model parallel group. torch.distributed.broadcast(flags, src=0) diff --git a/megatron/utils.py b/megatron/utils.py index 6bad05ea8..17faaded5 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -34,7 +34,6 @@ from megatron import mpu from megatron.data.samplers import DistributedBatchSampler from megatron.global_vars import get_use_wandb, get_tensorboard_writer -from megatron.fp16 import FP16_Optimizer from deepspeed import PipelineEngine, DeepSpeedEngine @@ -44,7 +43,6 @@ def reduce_losses(losses): [loss.clone().detach().view(1) for loss in losses]) torch.distributed.all_reduce(reduced_losses) reduced_losses = reduced_losses / torch.distributed.get_world_size() - return reduced_losses @@ -62,26 +60,6 @@ def report_memory(name): print_rank_0(string) -def print_params_min_max_norm(optimizer, iteration): - """Print min, max, and norm of all parameters.""" - index = 0 - rank = torch.distributed.get_rank() - string = 'iteration, rank, index, model-parallel,min, max, norm\n' - optimizer_ = optimizer - if isinstance(optimizer, FP16_Optimizer): - optimizer_ = optimizer.optimizer - for param_group in optimizer_.param_groups: - for param in param_group['params']: - index += 1 - min_ = param.data.min() - max_ = param.data.max() - norm = param.data.norm() - string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( - iteration, rank, index, int(param.model_parallel)) - string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) - print(string, flush=True) - - def check_adlr_autoresume_termination(iteration, model, optimizer, lr_scheduler): """Check for autoresume signal and exit if it is received.""" @@ -196,10 +174,12 @@ def is_local_main(): """ True if is the local main process """ return local_rank() == 0 + def is_mp_rank_0(): """True if mp rank == 0""" return mpu.get_model_parallel_rank() == 0 + def get_wandb_api_key(): """ Get Weights and Biases API key from ENV or .netrc file. 
Otherwise return None """ if 'WANDB_API_KEY' in os.environ: @@ -210,6 +190,7 @@ def get_wandb_api_key(): if wandb_token is not None: return wandb_token[1] + def obtain_resource_pool(hostfile_path, include_arg, exclude_arg) -> Dict[str, List[int]]: """ Get dict of `resource_pool[hostname] = [list of GPU ranks]` using hostfile, include and exclude args. @@ -235,7 +216,7 @@ def natural_sort(l): return sorted(l, key=alphanum_key) -def pipe_to_normal(model_engine): +def pipe_to_normal(model_engine, **kwargs): """ Takes in a deepspeed.PipelineEngine model and returns a deepspeed.DeepspeedEngine model with the same model weights so we can directly access the .forward() function (for inference). @@ -244,12 +225,17 @@ def pipe_to_normal(model_engine): """ assert isinstance(model_engine, PipelineEngine), f"model engine {model_engine} not a PipelineEngine instance" - return DeepSpeedEngine( + ret = DeepSpeedEngine( args=get_args(), model=model_engine.module, mpu=model_engine.module.mpu(), dist_init_required=False, - config_params=model_engine.config_params) + config_params=model_engine.config_params, + optimizer=model_engine.optimizer, + lr_scheduler=model_engine.lr_scheduler, + **kwargs) + return ret + def tb_wandb_log(key, value, iteration_no): # logs to both tb and wandb (if present) from the zeroth rank @@ -260,6 +246,7 @@ def tb_wandb_log(key, value, iteration_no): if get_use_wandb(): wandb.log({key: value}, step=iteration_no) + def ddb(rank=0): """ Distributed Debugger that will insert a py debugger on rank `rank` and diff --git a/pretrain_gpt2.py b/pretrain_gpt2.py index 6eb9217fb..4001dcc91 100644 --- a/pretrain_gpt2.py +++ b/pretrain_gpt2.py @@ -19,46 +19,32 @@ import socket import torch +import wandb from wandb import UsageError from megatron import get_args -from megatron import print_rank_0 from megatron import get_timers from megatron import get_tokenizer from megatron import mpu +from megatron import print_rank_0 from megatron.data.gpt2_dataset import build_train_valid_test_datasets -from megatron.global_vars import set_use_wandb, get_use_wandb -from megatron.model import GPT2Model, GPT2ModelPipe +from megatron.fp16 import fp32_to_fp16 +from megatron.global_vars import set_use_wandb +from megatron.model import GPT2ModelPipe +from megatron.model.gpt2_model import cross_entropy from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, is_local_main, local_rank, get_wandb_api_key from megatron.utils import reduce_losses -from megatron.fp16 import fp32_to_fp16 -import wandb - - -def model_provider(use_wandb=True, inference=False, get_key_value=True): - """Build the model.""" - args = get_args() - print_rank_0('building GPT2 model ...') - if args.pipe_parallel_size == 0: - model = GPT2Model(num_tokentypes=0, parallel_output=True, inference=inference, get_key_value=get_key_value) - else: - model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology(), inference=inference, get_key_value=get_key_value) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe - - ## Wandb. (one worker per machine) - # I think it should be like this it use the use_wandb input +def init_wandb(use_wandb, args): + # Wandb. 
(one worker per machine) use_wandb = is_local_main() and (get_wandb_api_key() is not None) and use_wandb set_use_wandb(use_wandb) args_dict = vars(args) if use_wandb: group_name = args_dict.get('wandb_group') name = f'{socket.gethostname()}-{local_rank()}' if group_name else None - try: wandb.init(project="neox", group=group_name, name=name, save_code=False, force=False, entity=args_dict.get('wandb_team')) @@ -66,27 +52,29 @@ def model_provider(use_wandb=True, inference=False, get_key_value=True): set_use_wandb(False) print(e) print('Skipping wandb. Execute `wandb login` on local or main node machine to enable.') - - if use_wandb: wandb.config.update(args_dict) - return model +def model_provider(use_wandb=True, inference=False, get_key_value=True): + """Build the model.""" -def get_batch(data_iterator): - """Generate a batch""" args = get_args() - tokenizer = get_tokenizer() + print_rank_0('building GPT2 model ...') + model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology(), inference=inference, + get_key_value=get_key_value) + if not args.is_pipe_parallel: + # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training + model = model.to_sequential() + else: + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + init_wandb(use_wandb, args) + return model - # Items and their type. - keys = ['text'] - datatype = torch.int64 - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None +def _get_batch(args, tokenizer, keys, data, datatype): + """Support function for get_batch / get_batch pipe (to avoid code repetition)""" data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. @@ -105,8 +93,8 @@ def get_batch(data_iterator): return tokens, labels, loss_mask, attention_mask, position_ids -def get_batch_pipe(data): - """A modification of get_batch() to work with the latest batch instead of an iterator. """ +def get_batch(data_iterator): + """Generate a batch""" args = get_args() tokenizer = get_tokenizer() @@ -115,21 +103,23 @@ def get_batch_pipe(data): datatype = torch.int64 # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + return _get_batch(args, tokenizer, keys, data, datatype) - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss) +def get_batch_pipe(data): + """A modification of get_batch() to work with the latest batch instead of an iterator. """ + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + tokens, labels, loss_mask, attention_mask, position_ids = _get_batch(args, tokenizer, keys, data, datatype) # unpack data if args.precision == "fp16": # cast to fp16 because pipeline parallelism skips the FP16 wrapper. @@ -148,10 +138,9 @@ def forward_step(data_iterator, model): tokens, labels, loss_mask, attention_mask, position_ids = get_batch( data_iterator) timers('batch generator').stop() - # Forward model. 
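In the refactor above, get_batch() and get_batch_pipe() both defer to a shared _get_batch() helper. At its core is the usual next-token setup: the label sequence is the input shifted left by one position. A self-contained sketch of just that shift (the helper name here is illustrative, and the mask / position-id construction done by get_ltor_masks_and_position_ids is omitted):

```python
import torch

def shift_for_lm(token_batch: torch.Tensor):
    """Split a [batch, seq+1] tensor into model inputs and next-token labels."""
    tokens_ = token_batch.long()
    labels = tokens_[:, 1:].contiguous()   # what the model should predict
    tokens = tokens_[:, :-1].contiguous()  # what the model actually sees
    return tokens, labels

tokens, labels = shift_for_lm(torch.arange(12).reshape(2, 6))
# tokens[0] -> [0, 1, 2, 3, 4]; labels[0] -> [1, 2, 3, 4, 5]
```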
- losses = model(tokens, position_ids, attention_mask, labels=labels) - loss_mask = loss_mask.view(-1) - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + outputs = model((tokens, position_ids, attention_mask)) + loss = cross_entropy(outputs, (labels, loss_mask), _fp16=args.fp16_lm_cross_entropy) # Reduce loss for logging. reduced_loss = reduce_losses([loss]) diff --git a/requirements/requirements-onebitadam.txt b/requirements/requirements-onebitadam.txt new file mode 100644 index 000000000..a6dd402b3 --- /dev/null +++ b/requirements/requirements-onebitadam.txt @@ -0,0 +1 @@ +cupy-cuda111==8.6.0 diff --git a/requirements/requirements-sparseattention.txt b/requirements/requirements-sparseattention.txt new file mode 100644 index 000000000..424b2d146 --- /dev/null +++ b/requirements/requirements-sparseattention.txt @@ -0,0 +1 @@ +triton==1.0.0.dev20210329 \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 2f7723432..c1638ad9a 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -3,7 +3,6 @@ six regex numpy==1.20.2 -e git+git://github.com/EleutherAI/DeeperSpeed.git@750f2140bf782cffeb578ce14a4e4cdb076f4326#egg=deepspeed -cupy-cuda111==8.6.0 mpi4py==3.0.3 wandb==0.10.25 einops==0.3.0 diff --git a/run_tests.py b/run_tests.py index 8ee9241bb..492272f5b 100644 --- a/run_tests.py +++ b/run_tests.py @@ -8,7 +8,6 @@ import unittest -from tests import * if __name__ == "__main__": loader = unittest.TestLoader() diff --git a/tasks/data_utils.py b/tasks/data_utils.py index 866a5e69a..f3376dc90 100644 --- a/tasks/data_utils.py +++ b/tasks/data_utils.py @@ -16,6 +16,7 @@ """ Tasks data utility.""" import re + import numpy as np diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index c89ea2cbf..04489c88c 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -21,8 +21,8 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import mpu +from megatron import print_rank_0 from tasks.finetune_utils import build_data_loader from tasks.finetune_utils import process_batch diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index 6df0eeef5..0effe0e82 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -21,9 +21,9 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import get_timers from megatron import mpu +from megatron import print_rank_0 from megatron.checkpointing import load_checkpoint from megatron.checkpointing import save_checkpoint from megatron.training import evaluate_and_print_results diff --git a/tasks/zeroshot_gpt2/datasets.py b/tasks/zeroshot_gpt2/datasets.py index 0d1f03756..077d00c94 100644 --- a/tasks/zeroshot_gpt2/datasets.py +++ b/tasks/zeroshot_gpt2/datasets.py @@ -22,8 +22,8 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import get_tokenizer +from megatron import print_rank_0 from .detokenizer import get_detokenizer diff --git a/tasks/zeroshot_gpt2/evaluate.py b/tasks/zeroshot_gpt2/evaluate.py index b1c06d205..06d04a719 100644 --- a/tasks/zeroshot_gpt2/evaluate.py +++ b/tasks/zeroshot_gpt2/evaluate.py @@ -20,15 +20,13 @@ import torch from megatron import get_args -from megatron import print_rank_0 from megatron import get_tokenizer from megatron import mpu +from megatron import print_rank_0 from megatron.checkpointing import load_checkpoint -from megatron.model import GPT2Model from megatron.training import get_model from 
megatron.utils import get_ltor_masks_and_position_ids from tasks.finetune_utils import build_data_loader - from .datasets import build_dataset @@ -48,9 +46,11 @@ def model_provider(): 'is not supported.'.format(eval_metric)) print_rank_0('building GPT2 model ...') - model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output) + # TODO: reimplement for pipe parallel + raise NotImplementedError + # model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output) - return model + # return model return model_provider diff --git a/tests/__init__.py b/tests/__init__.py index c3ed58809..7863a66f4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,5 +2,5 @@ Testcases for GPT NeoX """ -from .neox_args import * from .model import * +from .neox_args import * diff --git a/tests/model/__init__.py b/tests/model/__init__.py index 3c8d7288e..3b332cfaa 100644 --- a/tests/model/__init__.py +++ b/tests/model/__init__.py @@ -2,6 +2,5 @@ Tests concerning the GPT2Model class """ -from .test_model_initialization import TestModelInitialization from .test_model_checkpoint import TestModelCheckpoint #from .test_model_initialization_pipeline import TestModelInitializationPipeline \ No newline at end of file diff --git a/tests/model/test_configs/medium.yml b/tests/model/test_configs/medium.yml new file mode 100644 index 000000000..b75c018bd --- /dev/null +++ b/tests/model/test_configs/medium.yml @@ -0,0 +1,85 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe-parallel-size": 1, + "model-parallel-size": 1, + + # model settings + "num-layers": 4, + "hidden-size": 128, + "num-attention-heads": 8, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + "cpu_offload": False + }, + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data-impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight-decay": 0, + "hidden-dropout": 0, + "attention-dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train-iters": 320000, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "save-interval": 10000, + "eval-interval": 1000, + "eval-iters": 10, + + # logging + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/tests/model/test_configs/small.yml b/tests/model/test_configs/small.yml new file mode 100644 index 000000000..06787c78a --- /dev/null +++ b/tests/model/test_configs/small.yml @@ -0,0 +1,84 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe-parallel-size": 1, + "model-parallel-size": 1, + + # model settings + "num-layers": 12, + "hidden-size": 768, + "num-attention-heads": 12, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "zero_optimization": { + "stage": 0, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + "cpu_offload": False + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data-impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight-decay": 0.0, + "hidden-dropout": 0.0, + "attention-dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train-iters": 320000, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "save-interval": 10000, + "eval-interval": 1000, + "eval-iters": 10, + + # logging + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py index 3ee9c3551..efd27e9ed 100644 --- a/tests/model/test_model_checkpoint.py +++ b/tests/model/test_model_checkpoint.py @@ -1,26 +1,24 @@ import os -import re -import sys import shutil +import sys import unittest from unittest.mock import patch -from pathlib import Path if __name__ == "__main__": sys.path.append(os.path.abspath('')) from megatron.neox_arguments import NeoXArgs -from megatron.global_vars import set_global_variables, get_args, reset_global_variables -from megatron.model import GPT2Model, GPT2ModelPipe + +from megatron.global_vars import get_args, reset_global_variables + from megatron import initialize_megatron -from megatron import mpu from megatron.text_generation_utils import get_batch, forward_model from megatron.training import setup_model_and_optimizer from megatron.checkpointing import load_checkpoint from megatron.checkpointing import save_checkpoint from pretrain_gpt2 import model_provider -from megatron.utils import get_ltor_masks_and_position_ids, pipe_to_normal +from megatron.utils import pipe_to_normal from deepspeed import PipelineEngine from tests.common import get_root_directory, get_configs_with_path @@ -28,14 +26,14 @@ class TestModelCheckpoint(unittest.TestCase): - #def test_model_checkpoint(self): - # self.assertTrue(self.run_test_model_checkpoint(pipe_parallel_size=1)) - - def test_model_checkpoint(self): + def run_checkpoint_test(self, config_yml): reset_global_variables() # intitially load config from files as would be the case in deepy.py - yaml_list = get_configs_with_path(["small.yml", "local_setup.yml"]) + yaml_list = get_configs_with_path(["local_setup.yml"]) + yaml_list.append(f"{get_root_directory()}/tests/model/test_configs/{config_yml}") + print(os.listdir(".")) + args_loaded = NeoXArgs.from_ymls(yaml_list) args_loaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py")) args_loaded.update_value("pipe_parallel_size", 1) # overwrite pipeline parameter, config in small.yml may have changed! 
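The checkpoint test is reshaped from one hard-coded case into a run_checkpoint_test(config_yml) helper driven by per-size test methods over the small.yml and medium.yml configs above. A stripped-down sketch of that structure; the helper body here is only a placeholder, not the real save/reload round-trip:

```python
import unittest

class TestModelCheckpoint(unittest.TestCase):
    def run_checkpoint_test(self, config_yml: str):
        # Placeholder for the real flow: load the YAML config, build the model,
        # save a checkpoint, reload it, and compare outputs and parameters.
        self.assertTrue(config_yml.endswith(".yml"))

    def test_model_small(self):
        self.run_checkpoint_test("small.yml")

    def test_model_medium(self):
        self.run_checkpoint_test("medium.yml")

if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestModelCheckpoint("test_model_medium"))
    unittest.TextTestRunner(failfast=False).run(suite)
```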
@@ -61,9 +59,14 @@ def test_model_checkpoint(self): # Initialize new model model model, optimizer, lr_scheduler = setup_model_and_optimizer(lambda: model_provider(use_wandb=False)) - if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): - # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model - model = pipe_to_normal(model) + + # save model checkpoint + save_checkpoint(42, model, optimizer, lr_scheduler) + + #if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): + # # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model + # model = pipe_to_normal(model) + #model.to_sequential() model.eval() context_tokens_tensor = torch.cuda.LongTensor([[1,2,3,4,5],[1,2,3,4,5],[6,7,8,9,10],[1,2,3,4,100]]) @@ -82,15 +85,12 @@ def test_model_checkpoint(self): self.assertFalse(torch.isclose(output[1], output[2]).all().item()) self.assertTrue(torch.isclose(output[1, 3], output[3, 3]).all().item()) - # save model checkpoint - save_checkpoint(42, model, optimizer, lr_scheduler) - # reload model from checkpoint reloaded_model, optimizer, lr_scheduler = setup_model_and_optimizer(lambda: model_provider(use_wandb=False)) - if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): - # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model - model = pipe_to_normal(model) iteration = load_checkpoint(reloaded_model, optimizer, lr_scheduler) + if args.pipe_parallel_size == 1 and isinstance(reloaded_model, PipelineEngine): + # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model + reloaded_model = pipe_to_normal(reloaded_model) reloaded_model.eval() #ensure same checkpoint is loaded @@ -98,6 +98,10 @@ def test_model_checkpoint(self): reloaded_output = forward_model(model, (tokens, position_ids, attention_mask)) + #check re-loaded model returns the same results + self.assertTrue(torch.isclose(output, reloaded_output).all().item()) + + #check all weight groups are the same for idx, ((n1, p1), (n2, p2)) in enumerate(zip(list(model.module.named_parameters()), list(reloaded_model.module.named_parameters()))): self.assertTrue(n1 == n2) params_equal = (p1 == p2).all().item() @@ -105,14 +109,20 @@ def test_model_checkpoint(self): if not params_equal: print(f"test_model_checkpoint() layer {idx} {n1} has same parameters after loading of checkpoint", flush=True) - self.assertTrue(torch.isclose(output, reloaded_output).all().item()) - + #clear up checkpoint folder shutil.rmtree(path) - #TODO test changing batch size, because i had some weird experience with this last time + def test_model_small(self): + self.run_checkpoint_test("small.yml") + def test_model_medium(self): + self.run_checkpoint_test("medium.yml") if __name__ == "__main__": suite = unittest.TestSuite() - suite.addTest(TestModelCheckpoint("test_model_checkpoint")) + + #Run all required tests + #suite.addTest(TestModelCheckpoint("test_model_small")) + suite.addTest(TestModelCheckpoint("test_model_medium")) + unittest.TextTestRunner(failfast=False).run(suite) \ No newline at end of file diff --git a/tests/model/test_model_initialization.py b/tests/model/test_model_initialization.py index e2527d6d5..190eb9182 100644 --- a/tests/model/test_model_initialization.py +++ b/tests/model/test_model_initialization.py @@ -1,24 +1,22 @@ import os -import re import sys import unittest from unittest.mock import patch -from pathlib import 
Path if __name__ == "__main__": sys.path.append(os.path.abspath('')) from megatron.neox_arguments import NeoXArgs -from megatron.global_vars import set_global_variables, get_args, reset_global_variables +from megatron.global_vars import get_args, reset_global_variables from megatron.model import GPT2ModelPipe from megatron import initialize_megatron from megatron import mpu from tests.common import get_root_directory, get_configs_with_path -class TestModelInitializationPipeline(unittest.TestCase): +class TestModelInitialization(unittest.TestCase): - def test_model_initialization_pipeline(self): + def test_model_initialization(self): reset_global_variables() # intitially load config from files as would be the case in deepy.py diff --git a/tests/neox_args/__init__.py b/tests/neox_args/__init__.py index 535b02dab..f7dd120d7 100644 --- a/tests/neox_args/__init__.py +++ b/tests/neox_args/__init__.py @@ -2,8 +2,8 @@ Tests concerning NeoXArgs """ +from .test_neoxargs_commandline import TestNeoXArgsCommandLine from .test_neoxargs_implementation import TestNeoXArgsImplementation from .test_neoxargs_load import TestNeoXArgsLoad -from .test_neoxargs_commandline import TestNeoXArgsCommandLine +from .test_neoxargs_usage import TestNeoXArgsArgumentUsage from .test_neoxargs_validation import TestNeoXArgsValidation -from .test_neoxargs_usage import TestNeoXArgsArgumentUsage \ No newline at end of file diff --git a/tests/neox_args/test_neoxargs_commandline.py b/tests/neox_args/test_neoxargs_commandline.py index 24e5ddb68..84be22069 100644 --- a/tests/neox_args/test_neoxargs_commandline.py +++ b/tests/neox_args/test_neoxargs_commandline.py @@ -70,7 +70,7 @@ def test_neoxargs_consume_deepy_args_with_config_dir(self): self.assertTrue(args_loaded_yamls == args_loaded_consume) - def test_neoxargs_consume_megatron_args(self): + def test_neoxargs_consume_neox_args(self): """ verify megatron args are correctly consumed after sending via deepspeed """ @@ -83,7 +83,7 @@ def test_neoxargs_consume_megatron_args(self): # patch sys.argv so that args can be access by set_global_variables within initialize_megatron with patch('sys.argv', deepspeed_main_args): - args_loaded = NeoXArgs.consume_megatron_args() + args_loaded = NeoXArgs.consume_neox_args() #TODO is the wandb group really to be changed? 
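The renamed test exercises NeoXArgs.consume_neox_args() by patching sys.argv with the argument list the DeepSpeed launcher would pass to the training script. The same technique in isolation, using a plain argparse parser so the sketch stays self-contained:

```python
import argparse
import sys
from unittest.mock import patch

def consume_args():
    # Stand-in for NeoXArgs.consume_neox_args(): parses whatever is in sys.argv.
    parser = argparse.ArgumentParser()
    parser.add_argument("--wandb_group")
    return parser.parse_args(sys.argv[1:])

# Simulate the command line a launcher would construct.
fake_argv = ["pretrain_gpt2.py", "--wandb_group", "test-group"]
with patch("sys.argv", fake_argv):
    args = consume_args()
assert args.wandb_group == "test-group"
```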
args_loaded.wandb_group = args_baseline.wandb_group diff --git a/tests/neox_args/test_neoxargs_implementation.py b/tests/neox_args/test_neoxargs_implementation.py index ecc65ed16..702b038d6 100644 --- a/tests/neox_args/test_neoxargs_implementation.py +++ b/tests/neox_args/test_neoxargs_implementation.py @@ -1,9 +1,8 @@ -import os -import sys import unittest from megatron.neox_arguments import NeoXArgs + class TestNeoXArgsImplementation(unittest.TestCase): """ verify code implementation of NeoXArgs diff --git a/tests/neox_args/test_neoxargs_load.py b/tests/neox_args/test_neoxargs_load.py index 4113f2a20..84ee55b5b 100644 --- a/tests/neox_args/test_neoxargs_load.py +++ b/tests/neox_args/test_neoxargs_load.py @@ -1,7 +1,6 @@ import os import sys import unittest -from unittest.mock import patch if __name__ == "__main__": sys.path.append(os.path.abspath('')) diff --git a/tests/neox_args/test_neoxargs_validation.py b/tests/neox_args/test_neoxargs_validation.py index b0a57d264..3797495b2 100644 --- a/tests/neox_args/test_neoxargs_validation.py +++ b/tests/neox_args/test_neoxargs_validation.py @@ -1,9 +1,8 @@ -import os -import sys import unittest from megatron.neox_arguments import NeoXArgs + class TestNeoXArgsValidation(unittest.TestCase): """ verify the implementation of NeoXArgs diff --git a/text_gen_gpt2.py b/text_gen_gpt2.py index d933f13b0..ef68791d3 100755 --- a/text_gen_gpt2.py +++ b/text_gen_gpt2.py @@ -20,6 +20,7 @@ import os import sys + from pretrain_gpt2 import model_provider sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), @@ -30,9 +31,7 @@ from megatron.training import setup_model_and_optimizer from megatron.text_generation_utils import generate_and_write_samples_unconditional, generate_samples_input_from_file, \ generate_samples_interactive -from megatron.utils import pipe_to_normal -from deepspeed import PipelineEngine def main(extra_args_provider=None, get_key_value=True): """ @@ -53,9 +52,6 @@ def main(extra_args_provider=None, get_key_value=True): # Set up model and load checkpoint. 
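Most of the remaining hunks in the tasks/, tests/ and tools/ files only regroup imports into standard-library, third-party, and first-party blocks, roughly the ordering isort's default profile would produce. For reference, the grouping these files converge on looks like:

```python
# standard library
import os
import sys

# third-party
import torch

# first-party / project code
from megatron import print_rank_0
```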
model, _, _ = setup_model_and_optimizer(lambda: model_provider(use_wandb=False, inference=True, get_key_value=get_key_value)) - if args.pipe_parallel_size == 1 and isinstance(model, PipelineEngine): - # if it's a pipe parallel model but not actually doing parallelism, convert it to a normal deepspeed model - model = pipe_to_normal(model) print_rank_0('Finished loading model') if args.text_gen_type == 'unconditional': diff --git a/tools/convert_args_to_conf_file.py b/tools/convert_args_to_conf_file.py index 890b71651..210f5b7fc 100755 --- a/tools/convert_args_to_conf_file.py +++ b/tools/convert_args_to_conf_file.py @@ -1,24 +1,22 @@ #!/usr/bin/env python -import json import argparse +import dataclasses +import json import sys +from dataclasses import dataclass from io import StringIO from typing import Any -import dataclasses -import pandas as pd import deepspeed -from dataclasses import dataclass +import pandas as pd from deepspeed.constants import TORCH_DISTRIBUTED_DEFAULT_PORT from deepspeed.launcher.constants import PDSH_LAUNCHER from deepspeed.launcher.runner import DLTS_HOSTFILE - from megatron.arguments import _add_network_size_args, _add_regularization_args, _add_training_args, \ _add_initialization_args, _add_learning_rate_args, _add_checkpointing_args, _add_mixed_precision_args, \ _add_distributed_args, _add_validation_args, _add_data_args, _add_autoresume_args, _add_zero_args, \ _add_activation_checkpoint_args - from megatron.config_monster import megatron_keys_exclude, ds_config_keys diff --git a/tools/inspect_checkpoints.py b/tools/inspect_checkpoints.py index 63f12d128..b904f53ee 100644 --- a/tools/inspect_checkpoints.py +++ b/tools/inspect_checkpoints.py @@ -1,13 +1,14 @@ # Adapted from https://github.com/awaelchli/pytorch-lightning-snippets/blob/master/checkpoint/peek.py import code +import os +import re from argparse import ArgumentParser, Namespace from collections.abc import Mapping, Sequence from pathlib import Path -import pdb + import torch -import os -import re + class COLORS: BLUE = "\033[94m" diff --git a/tools/merge_mp_partitions.py b/tools/merge_mp_partitions.py index b4ec64db3..33bdb44ba 100644 --- a/tools/merge_mp_partitions.py +++ b/tools/merge_mp_partitions.py @@ -259,7 +259,7 @@ def main(): args.model_parallel_size = 1 mpu.initialize.set_model_parallel_rank(0) sd = {} - sd['model'] = merged_model.state_dict_for_save_checkpoint() + sd['model'] = merged_model.state_dict() sd['iteration'] = iteration merged_path = os.path.join(args.load, 'merged') checkpoint_name = get_checkpoint_name(merged_path, iteration) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 7ea56ec83..236365da9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -19,6 +19,7 @@ import multiprocessing import os import sys + import lm_dataformat as lmd sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),